Example #1
import csv
import pickle
import sys

from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# SQLite (database helpers), cleanupdocuments(), and database_location are
# assumed to be defined elsewhere in this project

def stopwords():
    cursor, conn = SQLite.connect_to_databse(database_location)
    datatable = SQLite.list_all_rows(cursor, 'papers')
    # Column 6 of the 'papers' table holds the paper text
    paperdata = [row[6] for row in datatable]

    cleandoc = cleanupdocuments(paperdata)
    # Build the token<->id mapping once and convert each document to bag-of-words
    dictionary = corpora.Dictionary(cleandoc)
    corpus = [dictionary.doc2bow(text) for text in cleandoc]

    # Persist the bag-of-words corpus and dictionary for reuse
    with open('ALL_corpus.pkl', 'wb') as f:
        pickle.dump(corpus, f)
    dictionary.save('ALL_dictionary.gensim')
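
    # Sketch (not in the original code): the saved artifacts can be reloaded
    # later with the standard gensim / pickle APIs:
    #   dictionary = corpora.Dictionary.load('ALL_dictionary.gensim')
    #   with open('ALL_corpus.pkl', 'rb') as f:
    #       corpus = pickle.load(f)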

    # Flatten every cleaned document into a single token list and insert it at
    # position 0, so that corpus[0] below scores the corpus as a whole
    cp_all = [token for doc in cleandoc for token in doc]
    cleandoc.insert(0, cp_all)
    print('done part dos, **thumbs up**')

    # Rebuild the bag-of-words corpus (now led by the whole-corpus document)
    # and score it with TF-IDF; vector holds the scores for corpus[0]
    dct = Dictionary.load('ALL_dictionary.gensim')
    corpus = [dct.doc2bow(line) for line in cleandoc]
    model = TfidfModel(corpus)
    vector = model[corpus[0]]
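    # Illustrative sketch (not in the original): vector is a list of
    # (token_id, weight) pairs; to eyeball the highest-scoring stopword
    # candidates before writing the CSV, it could be sorted by weight:
    #   top = sorted(vector, key=lambda tw: tw[1], reverse=True)[:20]
    #   print([(dct.get(t), w) for t, w in top])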
    print('done part tres, **smiley face**')

    # Map each scored token id back to its word
    cp_stop = []
    for token_id, token_weight in vector:
        cp_stop.append((dct.get(token_id), token_weight))
    print('done part quatros, yeehaw!')

    headers = ('word', 'score')

    with open('stopwords.csv','w',newline='',encoding='utf-8') as outFile:
        wtr = csv.writer(outFile)
        wtr.writerow(headers)
        wtr.writerows(cp_stop)

    with open('stopwords.csv', 'r', newline='', encoding='utf-8') as inFile:
        csvreader = csv.reader(inFile)
        next(csvreader)  # skip the header row
        # Keep only words scoring above the command-line threshold
        # (hard-coded alternative kept for reference):
        # stopwordvalue = [row for row in csvreader if float(row[1]) > 0.007]
        stopwordvalue = [row for row in csvreader if float(row[1]) > float(sys.argv[2])]

    with open('stopwords.csv','w',newline='',encoding='utf-8') as OutFile:
        wt = csv.writer(OutFile)
        wt.writerow(headers)
        wt.writerows(stopwordvalue)
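    # Design note (an observation, not in the original): filtering cp_stop in
    # memory before the first write would avoid this write -> read -> rewrite
    # round trip through stopwords.csv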

    print('STOP WORDS FOUND!!! Stored in stopwords.csv')
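
# Hypothetical invocation sketch (script name and argv[1] usage are assumed,
# not shown in the original; only sys.argv[2], the TF-IDF threshold, is read):
#   python find_stopwords.py <other-arg> 0.007
# if __name__ == '__main__':
#     stopwords()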
Example #2
import matplotlib.pyplot as plt
import numpy as np
import time
import sys
from gensim.models import Word2Vec  # used for the model build below

# SQLite (database helpers) and cleanupdocuments() are project modules assumed
# to be importable alongside this script

# =============================================================================
# User variables
# Location of SQLite database
# =============================================================================
database_location = '/home/greenbur/NLP/Python Code/WorkingPapersGOMlg.sqlite'

# Path where model will be saved
savemodelpath = '/home/greenbur/NLP/Results/GOMlgvec.txt'

# Load document data from database
# Connect to the SQLite database and load the data
cursor, conn = SQLite.connect_to_databse(database_location)
datatable = SQLite.list_all_rows(cursor, 'papers')

# Collect the paper text (column 6 of the 'papers' table) into a Python list
paperdata = [row[6] for row in datatable]

# Clean text for processing
cleandoc = cleanupdocuments(paperdata)
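# cleandoc is a list of token lists (one per paper), the format the gensim
# Word2Vec constructor below expects, e.g. [['some', 'tokens', ...], ...]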
print("Documents loaded and ready to process")

# This section builds the Word2Vec model and saves the model
print("Starting word2vec")

# Build Word2Vec model, params adjusted for future testing
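# A minimal sketch of the build this comment introduces (gensim >= 4 signature;
# parameter values are illustrative assumptions, not the original settings):
#   model = Word2Vec(sentences=cleandoc, vector_size=100, window=5,
#                    min_count=2, workers=4)
#   model.wv.save_word2vec_format(savemodelpath)  # writes plain-text vectors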