def dictmaker(folderlist, threshold, remove_stopwords=True, remove_punct=True):
    """
    Count the tokens in the files of the given folders and return the frequent ones.

    Every non-hidden file (name not starting with ".") in each folder of
    folderlist, resolved relative to the module-level base path pathi, is read
    as UTF-8, passed through ct.adtextextractor, tokenized with
    nltk.word_tokenize and lowercased before counting.

    Parameters:
    folderlist -- iterable of folder names below pathi.
    threshold -- a token is kept only if its count is strictly greater than
        float(threshold).
    remove_stopwords -- if true, drop tokens contained in the module-level
        stopwords collection.
    remove_punct -- if true, drop tokens for which `token in punctuation`
        holds (punctuation is module-level; per the original docstring it is
        string.punctuation, in which case this is a substring test).

    Side effects: prints progress messages and writes two dictionaries through
    ct.dictwriter, the unfiltered counts ("fulldict_<timestamp>") and the
    returned feature dictionary ("featuredict_<timestamp>").

    Returns a dict mapping token -> count.
    Raises ValueError / TypeError if threshold cannot be converted to float.
    """
    # NOTE(review): dictmaker is defined a second time further down in this
    # file; if both definitions sit at module level the later one wins.

    # Convert once, and before any file is read, so a bad threshold fails fast
    # instead of after the whole corpus has been processed.
    min_count = float(threshold)

    vocab = {}
    for folder in folderlist:
        folderpath = os.path.join(pathi, folder)
        filis = [i for i in os.listdir(folderpath) if not i.startswith(".")]
        print("Building vocab: we have {} files in folder {}".format(
            len(filis), folder))
        for fili in filis:
            # Context manager so the handle is closed right after reading;
            # the previous codecs.open(...).read() left it to the GC.
            with codecs.open(os.path.join(folderpath, fili), "r",
                             "utf-8") as infile:
                inputfile = infile.read()
            inputtext = ct.adtextextractor(inputfile, fili)
            for token in nltk.word_tokenize(inputtext):
                word = token.lower()
                vocab[word] = vocab.get(word, 0) + 1

    print("Our vocab dictionary has {} entries".format(len(vocab)))
    # NOTE(review): os.path.join does not expand "~"; presumably
    # ct.dictwriter calls os.path.expanduser on the path -- confirm.
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "fulldict_" + time.strftime("%H_%M_%m_%d")), vocab)

    if remove_stopwords:
        vocab = {
            key: value
            for key, value in vocab.items() if key not in stopwords
        }
        print("After stop word removal, dict is {} long".format(len(vocab)))
    if remove_punct:
        vocab = {
            key: value
            for key, value in vocab.items() if key not in punctuation
        }
        print("After punctuation removal, dict is {} long".format(len(vocab)))

    featuredict = {
        key: value
        for key, value in vocab.items() if value > min_count
    }
    print("Our feature dictionary has {} entries\n---------------\n".format(
        len(featuredict)))
    print("This is our featuredict {}".format(featuredict))
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "featuredict_" + time.strftime("%H_%M_%m_%d")),
        featuredict)
    return featuredict
def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
    Count, per cluster, how many tokens of the corpus belong to that cluster.

    This is adapted from the cluster_analysis dictmaker and was mainly used to
    test how well the counting in the word2vec analysis works.

    Every non-hidden file (name not starting with ".") in each folder of
    folderlist, resolved relative to the module-level base path pathi, is read
    as UTF-8, passed through ct.adtextextractor and ct.adcleaner, tokenized
    with word_tokenize and lowercased. Matches of punctuationregex are removed
    from each token; empty tokens and the token 'br' are dropped.

    Parameters:
    folderlist -- iterable of folder names below pathi.
    cluster_dictionary -- path to a UTF-8 JSON file. Each value of the loaded
        mapping must have a 'words' entry that supports `token in ...`
        (presumably a list of words -- confirm against the file's producer).

    Side effects: prints progress messages, a warning for every token
    occurrence that matches more than one cluster, the tokens that matched no
    cluster, and the result; writes the result through ct.dictwriter
    ("fulldict_<timestamp>").

    Returns a defaultdict(int) mapping cluster key -> number of token
    occurrences. A token found in several clusters is counted for the first
    matching cluster in the mapping's iteration order.
    """
    # NOTE(review): vec2wordclustercounter is defined a second time further
    # down in this file; if both sit at module level the later one wins.
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict = json.load(inputjson)

    result = defaultdict(int)
    # this is just for qc
    misses = []
    # Tokens repeat heavily, so remember which clusters each distinct token
    # belongs to instead of rescanning every cluster per occurrence.
    clusters_by_word = {}

    for folder in folderlist:
        folderpath = os.path.join(pathi, folder)
        filis = [i for i in os.listdir(folderpath) if not i.startswith(".")]
        print("Building vocab: we have {} files in folder {}".format(
            len(filis), folder))
        for fili in filis:
            # Context manager so the handle is closed right after reading;
            # the previous codecs.open(...).read() left it to the GC.
            with codecs.open(os.path.join(folderpath, fili), "r",
                             "utf-8") as infile:
                inputfile = infile.read()
            inputtext = ct.adtextextractor(inputfile, fili)
            # pre-processing here
            inputtext = ct.adcleaner(inputtext,
                                     replace_linebreak=True,
                                     remove_html=False)
            splittextlo = [i.lower() for i in word_tokenize(inputtext)]
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            finaltext = [i for i in finaltext if i and i not in ['br']]
            for word in finaltext:
                if word not in clusters_by_word:
                    clusters_by_word[word] = [
                        k for k, v in clusterdict.items()
                        if word in v['words']
                    ]
                cluster = clusters_by_word[word]
                if not cluster:
                    misses.append(word)
                    continue
                if len(cluster) > 1:
                    print(
                        "Warning: The item {} was found in more than one clusters"
                        .format(word))
                result[cluster[0]] += 1

    print("Our vocab dictionary has {} entries".format(len(result)))
    # NOTE(review): os.path.join does not expand "~"; presumably
    # ct.dictwriter calls os.path.expanduser on the path -- confirm.
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "fulldict_" + time.strftime("%H_%M_%m_%d")), result)
    print("misses {} {}".format(len(misses), set(misses)))
    print(result)
    return result
def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
    Count, per cluster, how many tokens of the corpus belong to that cluster.

    This is adapted from the cluster_analysis dictmaker and was mainly used to
    test how well the counting in the word2vec analysis works.

    Every non-hidden file (name not starting with ".") in each folder of
    folderlist, resolved relative to the module-level base path pathi, is read
    as UTF-8, passed through ct.adtextextractor and ct.adcleaner, tokenized
    with word_tokenize and lowercased. Matches of punctuationregex are removed
    from each token; empty tokens and the token 'br' are dropped.

    Parameters:
    folderlist -- iterable of folder names below pathi.
    cluster_dictionary -- path to a UTF-8 JSON file. Each value of the loaded
        mapping must have a 'words' entry that supports `token in ...`
        (presumably a list of words -- confirm against the file's producer).

    Side effects: prints progress messages, a warning for every token
    occurrence that matches more than one cluster, the tokens that matched no
    cluster, and the result; writes the result through ct.dictwriter
    ("fulldict_<timestamp>").

    Returns a defaultdict(int) mapping cluster key -> number of token
    occurrences. A token found in several clusters is counted for the first
    matching cluster in the mapping's iteration order.
    """
    # NOTE(review): an earlier definition of vec2wordclustercounter exists
    # above in this file; if both sit at module level this one shadows it.
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict = json.load(inputjson)

    result = defaultdict(int)
    # this is just for qc
    misses = []
    # Tokens repeat heavily, so remember which clusters each distinct token
    # belongs to instead of rescanning every cluster per occurrence.
    clusters_by_word = {}

    for folder in folderlist:
        folderpath = os.path.join(pathi, folder)
        filis = [i for i in os.listdir(folderpath) if not i.startswith(".")]
        print("Building vocab: we have {} files in folder {}".format(
            len(filis), folder))
        for fili in filis:
            # Context manager so the handle is closed right after reading;
            # the previous codecs.open(...).read() left it to the GC.
            with codecs.open(os.path.join(folderpath, fili), "r",
                             "utf-8") as infile:
                inputfile = infile.read()
            inputtext = ct.adtextextractor(inputfile, fili)
            # pre-processing here
            inputtext = ct.adcleaner(inputtext,
                                     replace_linebreak=True,
                                     remove_html=False)
            splittextlo = [i.lower() for i in word_tokenize(inputtext)]
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            finaltext = [i for i in finaltext if i and i not in ['br']]
            for word in finaltext:
                if word not in clusters_by_word:
                    clusters_by_word[word] = [
                        k for k, v in clusterdict.items()
                        if word in v['words']
                    ]
                cluster = clusters_by_word[word]
                if not cluster:
                    misses.append(word)
                    continue
                if len(cluster) > 1:
                    print(
                        "Warning: The item {} was found in more than one clusters"
                        .format(word))
                result[cluster[0]] += 1

    print("Our vocab dictionary has {} entries".format(len(result)))
    # NOTE(review): os.path.join does not expand "~"; presumably
    # ct.dictwriter calls os.path.expanduser on the path -- confirm.
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "fulldict_" + time.strftime("%H_%M_%m_%d")), result)
    print("misses {} {}".format(len(misses), set(misses)))
    print(result)
    return result
def dictmaker(folderlist, threshold, remove_stopwords=True, remove_punct=True):
    """
    Count the tokens in the files of the given folders and return the frequent ones.

    Every non-hidden file (name not starting with ".") in each folder of
    folderlist, resolved relative to the module-level base path pathi, is read
    as UTF-8, passed through ct.adtextextractor and adcleaner, tokenized with
    word_tokenize and lowercased. Matches of punctuationregex are removed from
    each token; empty tokens and the token 'br' are dropped before counting.

    Parameters:
    folderlist -- iterable of folder names below pathi.
    threshold -- a token is kept only if its count is strictly greater than
        float(threshold).
    remove_stopwords -- if true, drop tokens contained in the module-level
        stopwords collection.
    remove_punct -- if true, drop tokens for which `token in punctuation`
        holds (punctuation is module-level; per the original docstring it is
        string.punctuation, in which case this is a substring test).

    Side effects: prints progress messages and writes two dictionaries through
    ct.dictwriter, the unfiltered counts ("fulldict_<timestamp>") and the
    returned feature dictionary ("featuredict_<timestamp>").

    Returns a dict mapping token -> count.
    Raises ValueError / TypeError if threshold cannot be converted to float.
    """
    # NOTE(review): an earlier definition of dictmaker exists above in this
    # file; if both sit at module level this one shadows it.

    # Convert once, and before any file is read, so a bad threshold fails fast
    # instead of after the whole corpus has been processed.
    min_count = float(threshold)

    vocab = {}
    for folder in folderlist:
        folderpath = os.path.join(pathi, folder)
        filis = [i for i in os.listdir(folderpath) if not i.startswith(".")]
        print("Building vocab: we have {} files in folder {}".format(
            len(filis), folder))
        for fili in filis:
            # Context manager so the handle is closed right after reading;
            # the previous codecs.open(...).read() left it to the GC.
            with codecs.open(os.path.join(folderpath, fili), "r",
                             "utf-8") as infile:
                inputfile = infile.read()
            inputtext = ct.adtextextractor(inputfile, fili)
            # pre-processing here
            # NOTE(review): adcleaner is called unqualified here, whereas
            # vec2wordclustercounter in this file calls ct.adcleaner --
            # confirm that a bare adcleaner is actually in scope.
            inputtext = adcleaner(inputtext,
                                  replace_linebreak=True,
                                  remove_html=False)
            splittextlo = [i.lower() for i in word_tokenize(inputtext)]
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            for word in finaltext:
                # Tokens that were pure punctuation are empty by now; 'br' is
                # skipped as well, exactly as the original filter did.
                if not word or word in ['br']:
                    continue
                vocab[word] = vocab.get(word, 0) + 1

    print("Our vocab dictionary has {} entries".format(len(vocab)))
    # NOTE(review): os.path.join does not expand "~"; presumably
    # ct.dictwriter calls os.path.expanduser on the path -- confirm.
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "fulldict_" + time.strftime("%H_%M_%m_%d")), vocab)

    if remove_stopwords:
        vocab = {
            key: value
            for key, value in vocab.items() if key not in stopwords
        }
        print("After stop word removal, dict is {} long".format(len(vocab)))
    if remove_punct:
        vocab = {
            key: value
            for key, value in vocab.items() if key not in punctuation
        }
        print("After punctuation removal, dict is {} long".format(len(vocab)))

    featuredict = {
        key: value
        for key, value in vocab.items() if value > min_count
    }
    print("Our feature dictionary has {} entries\n---------------\n".format(
        len(featuredict)))
    print("This is our featuredict {}".format(featuredict))
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "featuredict_" + time.strftime("%H_%M_%m_%d")),
        featuredict)
    return featuredict