Example #1
def checkIdenticals():
    """Find WOS IDs from 2011 that occur in both the existing data and the
    new unlabelled data, then drop those duplicates from the new data."""
    old = ptd.getDataWithMeta()
    old_2011 = old[old.Publication_year == 2011]
    old_2011_wos = old_2011.WOS.tolist()
    new = ptd.getUnlabelledData()
    print("len of new data: {}".format(len(new)))
    # Publication_year is stored as a string in the unlabelled data.
    new_2011 = new[new.Publication_year == "2011"]
    new_2011_wos = new_2011.WOS.tolist()

    print("old length 2011: {}".format(len(old_2011_wos)))
    print("new length 2011: {}".format(len(new_2011_wos)))

    print(old_2011_wos[:5])
    print(new_2011_wos[:5])

    # Collect every WOS ID that appears in both the old and new 2011 lists.
    identical = []
    for wos in new_2011_wos:
        for wos2 in old_2011_wos:
            if wos == wos2:
                print("{}\n{}\n".format(wos, wos2))
                identical.append(wos)

    print("Number of identical papers = {}".format(len(identical)))

    new_data = ptd.getUnlabelledDataAsList()

    print ("len of old before: {}".format(len(new_data)))
    new_data_after = []
    for dic in new_data:
        if dic["WOS"] not in identical:
            new_data_after.append(dic)

    print ("len of old after: {}".format(len(new_data_after)))
Example #2
def storeSubjectsToJson(stance="All"):
    if stance == "All":
        #d = ptd.getMetaDataAsList()
        d = ptd.getDataWithMeta()
        frame = pd.DataFrame(d).Subjects
    else:
        frame = getStanceData(stance).Subjects
    frame.fillna("nan", inplace=True)
    headers = frame.tolist()
    headers = [h for h in headers if h != "nan"]
    headers = [h.lower() for sublist in headers for h in sublist]
    uniq_head = list(set(headers))
    d = dict(zip(uniq_head, np.zeros(len(uniq_head))))
    for s in headers:
        d[s] += 1

    #for key in d.keys():
    #    print("{} \t: {}".format(key, d[key]))
    print("There are a total of {} subjects, with {} unique ones".format(len(headers), len(uniq_head)))
Example #3
import string
import time
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def LDA_scikit(stance="All", use_in_experiment=False, frame=None):
    n_top_words = 10
    n_topics = 1
    start = time.time()
    # Python 3 replacement for the old string.maketrans("", "") idiom:
    # a translation table that deletes all punctuation characters.
    table = str.maketrans("", "", string.punctuation)
    if not use_in_experiment:
        if stance == "All":
            data = ptd.getData().Abstract
        else:
            data = ptd.getDataWithMeta()
            data = data[data.Stance == stance].Abstract
    else:
        data = frame.Abstract

    data_as_list = data.tolist()
    # Strip punctuation and lowercase each abstract.
    raw_docs = []
    for d in data_as_list:
        raw_docs.append(str(d).translate(table).lower())

    collection = []
    print("Extracting tf features for LDA...")
    for doc in raw_docs:
        tf_vectorizer = CountVectorizer(stop_words='english')
        tf_tfidf = TfidfVectorizer(stop_words='english')
        tf = tf_vectorizer.fit_transform([doc])
        tf2 = tf_tfidf.fit_transform([doc])
        # n_components replaced the old n_topics argument in newer
        # scikit-learn releases; one single-topic model per abstract.
        model = LatentDirichletAllocation(n_components=n_topics, random_state=1)
        model.fit(tf)

        tf_feature_names = tf_vectorizer.get_feature_names_out()
        topic_word_collection = print_top_words(model, tf_feature_names, n_top_words, "abstract_"+stance+"_count.pkl")
        #print
        #model.fit(tf2)
        #tf_feature_names2 = tf_vectorizer.get_feature_names()
        #topic_word_collection2 = print_top_words(model, tf_feature_names2, n_top_words, "abstract_"+stance+"_tfidf.pkl")
        collection.append(topic_word_collection)

    #print("\nTime used: {:.4f}".format((time.time()-start)/60.0))
    # Each entry in collection is the topic/word list for one abstract;
    # the single fitted topic sits at index 0.
    return [topics[0] for topics in collection]
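print_top_words is called above but not defined in this excerpt. A plausible sketch of such a helper, assuming it prints the strongest words per topic and pickles them to the given filename; the original implementation may differ:

import pickle

def print_top_words(model, feature_names, n_top_words, filename):
    # Collect the n_top_words highest-weighted terms for each topic.
    topic_word_collection = []
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #{}: {}".format(topic_idx, " ".join(top_words)))
        topic_word_collection.append(top_words)
    # Persist the topic/word lists for later reuse.
    with open(filename, "wb") as f:
        pickle.dump(topic_word_collection, f)
    return topic_word_collection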