import json
import os

import pandas as pd

import util


def get_base_data(config):
    # Load the cleaned corpus and the subset of rows marked as useable.
    df = pd.read_csv(
        os.path.join(config["clean"]["baseDir"], config["vectorize"]["cleanHash"], "result.csv"),
        low_memory=False)
    udf = pd.read_csv(
        os.path.join(config["clean"]["baseDir"], config["vectorize"]["cleanHash"], "useable.csv"),
        low_memory=False)
    with open(os.path.join(config["vectorize"]["outputDir"], "info.json"), "r") as f:
        info = json.load(f)
    stopWords = util.getStopWords(config)
    # Summary statistics over the corpus, returned as a single-element list.
    return [{
        "all": df.id.count(),
        "annot": df[~df.notAnnot].id.count(),
        "payloadMinLength": config["clean"]["payloadMinLength"],
        "duplicates": df[~df.notAnnot & df.duplicate].id.count(),
        "useable": udf[udf.useable].id.count(),
        "labelsets": udf.labels.nunique(),
        "labelsetsOnce": udf.groupby(udf.labels).labels.count().value_counts().get(1),
        "labelCardinality": sum(udf.nol) / len(udf),
        "labelDensity": sum(udf.nol) / (len(udf) * len(util.getLabels(config)[1:])),
        "special": udf[udf.special].id.count(),
        "allFeatures": info["allFeatures_bow"],
        "noTrain": info["noTrain"],
        "noTest": info["noTest"],
        "noTrain_train": info["noTrain_train"],
        "noTrain_val": info["noTrain_val"],
        "noStopWords": len(stopWords),
        "wc_mean": udf.wc.mean(),
        "wc_median": udf.wc.median(),
        "wc_first_quartile": udf.wc.quantile(.25)
    }]
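# Usage sketch (not from the original project): get_base_data expects the same
# nested config dict the function indexes into above. The directory names and
# the hash value below are hypothetical placeholders for illustration only.
example_config = {
    "clean": {"baseDir": "data/clean", "payloadMinLength": 20},
    "vectorize": {"cleanHash": "abc123", "outputDir": "data/vectorized"},
}
stats = get_base_data(example_config)[0]
print(stats["useable"], stats["labelCardinality"])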
def unigram_noStop(md):
    """
    arguments: md is a util.MovieData object
    returns: a dictionary containing a mapping from unigram features from the
        reviews to their values on this util.MovieData object, with stop words
        removed
    """
    unigramCount = unigram_feats(md)
    # drop every stop word from the unigram counts
    for sword in util.getStopWords():
        del unigramCount[sword]
    return unigramCount
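# unigram_noStop relies on a unigram_feats helper that is not shown in this
# section. Based on the bigram variant below, it presumably looks roughly like
# this sketch (asciified, lowercased, punctuation-stripped, non-numeric tokens);
# treat it as an assumption, not the project's actual implementation.
from collections import Counter

import util


def unigram_feats(md):
    # Sketch of the assumed helper: count unigrams per reviewer field.
    c = Counter()
    for rev in util.MovieData.reviewers:
        if hasattr(md, rev):
            wordList = util.punct_patt.sub(
                "", util.asciify(md.__dict__[rev].strip().lower())).split()
            c.update(x for x in wordList if util.non_numeric(x))
    return c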
from collections import Counter

import util


def bigram_feats_noStop(md):
    c = Counter()
    stopWords = util.getStopWords()
    for rev in util.MovieData.reviewers:
        if hasattr(md, rev):
            # count occurrences of bigrams built from asciified, lowercase,
            # non-numeric, non-stop-word unigrams after removing punctuation
            wordList = util.punct_patt.sub(
                "", util.asciify(md.__dict__[rev].strip().lower())).split()
            wordList = [x for x in wordList
                        if util.non_numeric(x) and util.notStopWord(x, stopWords)]
            c.update(zip(wordList, wordList[1:]))
    return c
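# Usage sketch: assuming `md` is an already-constructed util.MovieData object
# (how MovieData instances are built is not shown in this section), the
# returned Counter supports the usual most_common inspection. The helper below
# is hypothetical and not part of the original project.
def print_top_bigrams(md, n=10):
    for bigram, count in bigram_feats_noStop(md).most_common(n):
        print(" ".join(bigram), count)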
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import codecs

import util
import dataImporter

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = util.getStopWords('stopwords.txt')

# read a comma-separated list of channel names from the user
userinput = input('Enter your list: ')
channel_list = userinput.split(',')

# dump each channel's messages to <channel>.txt
for channel in channel_list:
    dataImporter.writeChannelMessagesToFile(channel + '.txt', channel)

# read every dumped channel file back in as one document
doc_set = []
for channel in channel_list:
    with codecs.open(channel + '.txt', encoding='utf-8') as f:
        doc_set.append(f.read())

# list for tokenized documents in loop
texts = []

# loop through document list
for doc in doc_set: