def checkIdenticals():
    """Report 2011 papers that appear in both the labelled ("old") and the
    unlabelled ("new") data, matched on WOS id, then show how many
    unlabelled records remain once those duplicates are filtered out."""
    old = ptd.getDataWithMeta()
    old_2011 = old[old.Publication_year == 2011]
    old_2011_wos = old_2011.WOS.tolist()

    new = ptd.getUnlabelledData()
    print("len of new data: {}".format(len(new)))
    # Publication_year is an int in the labelled frame but a string in the
    # unlabelled one, hence the differing comparisons.
    new_2011 = new[new.Publication_year == "2011"]
    new_2011_wos = new_2011.WOS.tolist()

    print("old length 2011: {}".format(len(old_2011_wos)))
    print("new length 2011: {}".format(len(new_2011_wos)))
    print(old_2011_wos[:5])
    print(new_2011_wos[:5])

    # Collect every WOS id that occurs in both year-2011 lists.
    identical = []
    for wos in new_2011_wos:
        for wos2 in old_2011_wos:
            if wos == wos2:
                print("{}\n{}\n".format(wos, wos2))
                identical.append(wos)
    print("Number of identical papers = {}".format(len(identical)))

    # Drop the duplicated papers from the unlabelled records.
    new_data = ptd.getUnlabelledDataAsList()
    print("len of new data before filtering: {}".format(len(new_data)))
    new_data_after = [dic for dic in new_data if dic["WOS"] not in identical]
    print("len of new data after filtering: {}".format(len(new_data_after)))
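
# Sketch (not part of the original pipeline): the nested loop in
# checkIdenticals is O(n*m); a set intersection finds the same overlapping
# WOS ids in roughly linear time, assuming the ids are hashable strings.
# findIdenticalWos is a hypothetical helper name, not from this repo.
def findIdenticalWos(new_wos, old_wos):
    """Return the set of WOS ids present in both lists."""
    return set(new_wos) & set(old_wos)

# Usage sketch, mirroring the filtering step above:
#   identical = findIdenticalWos(new_2011_wos, old_2011_wos)
#   kept = [d for d in ptd.getUnlabelledDataAsList() if d["WOS"] not in identical]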
def storeSubjectsToJson(stance="All"):
    """Count subject labels across the data set (or a single stance) and
    print the totals. Note: despite the name, nothing is written to JSON
    here yet."""
    if stance == "All":
        d = ptd.getDataWithMeta()
        frame = pd.DataFrame(d).Subjects
    else:
        frame = getStanceData(stance).Subjects
    frame.fillna("nan", inplace=True)

    # Each row holds a list of subjects; drop missing rows, then flatten
    # and lower-case the subject strings.
    headers = frame.tolist()
    headers = [h for h in headers if h != "nan"]
    headers = [h.lower() for sublist in headers for h in sublist]

    # Tally occurrences per unique subject.
    uniq_head = list(set(headers))
    counts = {h: 0 for h in uniq_head}
    for s in headers:
        counts[s] += 1
    print("There are a total of {} subjects, with {} unique ones".format(
        len(headers), len(uniq_head)))
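
# Sketch: storeSubjectsToJson only prints its tallies; if the JSON output its
# name promises is wanted, something like this would persist the counts dict.
# The helper name and the "subject_counts.json" default are assumptions.
import json

def dumpSubjectCounts(counts, path="subject_counts.json"):
    """Write a {subject: count} mapping to disk as pretty-printed JSON."""
    with open(path, "w") as f:
        json.dump(counts, f, indent=2, sort_keys=True)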
def LDA_scikit(stance="All", use_in_experiment=False, frame=None):
    """Fit a one-topic LDA model per abstract and return each abstract's
    top words."""
    n_top_words = 10
    n_topics = 1
    start = time.time()

    if not use_in_experiment:
        if stance == "All":
            data = ptd.getData().Abstract
        else:
            data = ptd.getDataWithMeta()
            data = data[data.Stance == stance].Abstract
    else:
        data = frame.Abstract

    # Lower-case the abstracts and strip punctuation.
    table = str.maketrans("", "", string.punctuation)
    raw_docs = [str(d).translate(table).lower() for d in data.tolist()]

    collection = []
    print("Extracting tf features for LDA...")
    for doc in raw_docs:
        # One single-topic model per document: with n_topics=1 this amounts
        # to ranking the document's own terms.
        tf_vectorizer = CountVectorizer(stop_words='english')
        tf = tf_vectorizer.fit_transform([doc])
        # n_topics was renamed n_components in newer scikit-learn releases.
        model = LatentDirichletAllocation(n_components=n_topics, random_state=1)
        model.fit(tf)
        tf_feature_names = tf_vectorizer.get_feature_names_out()
        topic_word_collection = print_top_words(
            model, tf_feature_names, n_top_words,
            "abstract_" + stance + "_count.pkl")
        collection.append(topic_word_collection)

    print("\nTime used: {:.4f} min".format((time.time() - start) / 60.0))
    # Each entry holds the topics for one abstract; with a single topic,
    # return that topic's word list per abstract.
    return [topics[0] for topics in collection]
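
# Sketch of the more conventional corpus-level setup, for contrast: one LDA
# fit over all abstracts rather than a single-topic model per abstract.
# Assumes the same sklearn imports as above; lda_over_corpus is a
# hypothetical helper and n_components=5 is an arbitrary illustration.
def lda_over_corpus(raw_docs, n_components=5, n_top_words=10):
    """Fit one LDA over the whole corpus; return top words per topic."""
    vectorizer = CountVectorizer(stop_words="english")
    tf = vectorizer.fit_transform(raw_docs)
    lda = LatentDirichletAllocation(n_components=n_components, random_state=1)
    lda.fit(tf)
    feature_names = vectorizer.get_feature_names_out()
    # lda.components_ holds per-topic word weights; argsort for top words.
    return [
        [feature_names[i] for i in topic.argsort()[: -n_top_words - 1 : -1]]
        for topic in lda.components_
    ]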