def categorymachine(folderlist): print "starting category machine" catdicti = {} catnumber = 0 for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] for fili in filis: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext = ct.adtextextractor(inputfile, fili) #lets establish the category #we need to make it numeric, so the numpy won't screw up category = ct.tagextractor(inputfile, "category1", fili) try: cat = catdicti[category] except: print "We added {} to the category dictionary, coded as {}".format( category, catnumber) catdicti[ct.tagextractor(inputfile, "category1", fili)] = catnumber catnumber = catnumber + 1 cat = catdicti[ct.tagextractor(inputfile, "category1", fili)] return (catdicti, catnumber)
def matrixmachine(folderlist, featuredict, *args): # ###BUILDING MATRICES # # args are external categories #matrixmachine takes a list of folders and of external categories to be included, note that it calls on the category machine print "Starting the matrixmachine" print "external categories: ", len(args) print args #the plus one in here is for the file id wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1))) print "Matrix initial shape: ", np.shape(wordmatrix) # making a dictionary for the categories # we need the zero cause the machine returns 2 items count = 0 catdicti = categorymachine(folderlist)[0] filedict = {} for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] print "Building matrices: we have {} files in folder {}".format( len(filis), folder) for fili in filis[:20]: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() #establish category for external_cat in args: cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)] count = count + 1 filedict[count] = os.path.join(pathi, folder, fili) #note that punctuation is still in here splittext = nltk.word_tokenize(inputfile) splittextlo = [i.lower() for i in splittext] wordcount = float(len(splittextlo)) # this is a per word frequency wordvector = np.array( [float(cat)] + [float(count)] + [float(splittextlo.count(i)) / wordcount for i in featuredict]) #print wordvector #we append it to the matrix wordmatrix = np.append(wordmatrix, [wordvector], axis=0) print "Features of word matrix: shape {}, dtype {}".format( np.shape(wordmatrix), wordmatrix.dtype) print "---------------\nEnd of public service announcements\n\n" #"In 2D, the first dimension corresponds to rows, the second to columns." 
# we don't look at the first row cause that was just for initialization # the one without cats we put into the clustering algorithm wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0], len(args) + 1:wordmatrix.shape[1]] print "without", np.shape(wordmatrix_without_cat) wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ] print "with", np.shape(wordmatrix_with_cat) return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def categorymachine(folderlist):
    """Assign a numeric code to every "category1" label in the corpus.

    Iterates over all non-hidden files under each folder in *folderlist*
    (relative to the module-level `pathi`), extracts the "category1" tag
    with ct.tagextractor(), and records each previously unseen label in
    catdicti under the next free integer code.

    Returns (catdicti, catnumber): the label -> code dict and the number
    of distinct labels seen.
    """
    print "starting category machine"
    catdicti={}
    catnumber=0
    for folder in folderlist:
        # skip hidden files (e.g. .DS_Store)
        filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith (".")]
        for fili in filis:
            # NOTE(review): the file handle is never closed explicitly
            inputfile=codecs.open(os.path.join(pathi, folder,fili), "r", "utf-8").read()
            inputtext=ct.adtextextractor(inputfile, fili)
            # lets establish the category
            # we need to make it numeric, so the numpy won't screw up
            category=ct.tagextractor(inputfile, "category1", fili)
            try:
                cat=catdicti[category]
            except:
                # unseen label: register it under the next free code
                # NOTE(review): bare except -- only a KeyError is expected here
                print "We added {} to the category dictionary, coded as {}".format(category, catnumber)
                catdicti[ct.tagextractor(inputfile, "category1", fili)]=catnumber
                catnumber=catnumber+1
                cat=catdicti[ct.tagextractor(inputfile, "category1", fili)]
    return (catdicti, catnumber)
def matrixmachine(folderlist, featuredict, *args): # ###BUILDING MATRICES # # args are external categories #matrixmachine takes a list of folders and of external categories to be included, note that it calls on the category machine print "Starting the matrixmachine" print "external categories: ", len(args) print args #the plus one in here is for the file id wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1))) print "Matrix initial shape: ", np.shape(wordmatrix) # making a dictionary for the categories # we need the zero cause the machine returns 2 items count=0 catdicti=categorymachine(folderlist)[0] filedict={} for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")] print "Building matrices: we have {} files in folder {}".format(len(filis), folder) for fili in filis[:20]: inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() #establish category for external_cat in args: cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)] count=count+1 filedict[count]=os.path.join(pathi, folder, fili) #note that punctuation is still in here splittext=nltk.word_tokenize(inputfile) splittextlo=[i.lower() for i in splittext] wordcount=float(len(splittextlo)) # this is a per word frequency wordvector=np.array([float(cat)]+[float(count)]+[float(splittextlo.count(i))/wordcount for i in featuredict]) #print wordvector #we append it to the matrix wordmatrix=np.append(wordmatrix, [wordvector], axis=0) print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype) print "---------------\nEnd of public service announcements\n\n" #"In 2D, the first dimension corresponds to rows, the second to columns." 
# we don't look at the first row cause that was just for initialization # the one without cats we put into the clustering algorithm wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]] print "without", np.shape(wordmatrix_without_cat) wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],] print "with", np.shape(wordmatrix_with_cat) return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def wordcounter(input_dir, category_tag, category_dict): """ counts the words per category in the files in input_dir. Parameters ---------- input_dir is the corpus directoty category_tag is the name of the tag to be extracted with tagextractor. category_dict is a dictionary of categories to be computed over (category names as keys) e.g. <location="X"> would be input with "location" as the category_tag and a dict with {"Austin":0, "Dallas":0, ...} Returns ------- something """ print "Running the wordcounter" resultdict = category_dict for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print pati for fili in [ i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputfili: inputfili = inputfili.read() wordcount = len( ct.tokenizer(ct.adtextextractor(inputfili, fili), remove_punctuation=True)) category = ct.tagextractor(inputfili, category_tag, fili) if category in resultdict: resultdict[category] = resultdict[category] + wordcount else: print "\n\nWARNING:\n{} is not in the category_dict. What do we do now?\n\n".format( category) print "Wordcounter done" with codecs.open("wordcounter_" + category_tag + ".json", "w", "utf-8") as jsonout: json.dump(resultdict, jsonout)
def matrixmachine(folderlist, featuredict, external_category): #constructing matrix print "Starting the matrixmachine" wordmatrix=np.empty(shape=(1,len(featuredict)+1)) print "Matrix initial shape: ", np.shape(wordmatrix) # making a dictionary for the categories # we need the zero cause the machine returns 2 items catdicti=categorymachine(folderlist)[0] for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")] print "Building matrices: we have {} files in folder {}".format(len(filis), folder) for fili in filis: inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() #establish category cat=catdicti[ct.tagextractor(inputfile, external_category, fili)] #collect a dictionary with all lowercased words #note that punctuation is still in here splittext=nltk.word_tokenize(inputfile) # we lowercase splittextlo=[i.lower() for i in splittext] #number of "words" wordcount=float(len(splittextlo)) # we make the vector for this file # this is a per word frequency wordvector=np.array([float(cat)]+[float(splittextlo.count(i))/wordcount for i in featuredict]) #print wordvector #we append it to the matrix wordmatrix=np.append(wordmatrix, [wordvector], axis=0) print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype) print "---------------\nEnd of public service announcements\n\n" #"In 2D, the first dimension corresponds to rows, the second to columns." # we don't look at the first row cause that was just for initialization # the one without cats we put into the clustering algorithm wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],1:wordmatrix.shape[1]] wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],] return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti)
def matrixmachine(folderlist, featuredict, testmode, *args): """ The matrixmachine creates matrices of word frequencies / item frequencies. It returns wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering. wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. It takes The folderlist is a collection of folders to iterate over. The featuredict is a dictionary containing the words to count. If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. (Note that the testmode comes all the way from main()) The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. Args will be added to the matrix_with_cat. 
""" print "Starting the matrixmachine" print "external categories: ", len(args) print args #the plus one in here is for the file id wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1))) print "Matrix initial shape: ", np.shape(wordmatrix) # making a dictionary for the categories # we need the zero cause the machine returns 2 items count = 0 catdicti = categorymachine(folderlist)[0] filedict = {} featuredict = featuredict.keys() for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] if testmode == True: print "\n\nRUNNING\nIN\nTEST\nMODE\n" filis = filis[:200] print "Building matrices: we have {} files in folder {}".format( len(filis), folder) for fili in filis: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() #establish category for external_cat in args: cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)] count = count + 1 filedict[count] = os.path.join(pathi, folder, fili) #note that punctuation is still in here splittext = nltk.word_tokenize(inputfile) splittextlo = [i.lower() for i in splittext] wordcount = float(len(splittextlo)) # this is a per word frequency wordvector = np.array( [float(cat)] + [float(count)] + [float(splittextlo.count(i)) / wordcount for i in featuredict]) #print wordvector #we append it to the matrix wordmatrix = np.append(wordmatrix, [wordvector], axis=0) print "Features of word matrix: shape {}, dtype {}".format( np.shape(wordmatrix), wordmatrix.dtype) print "---------------\nEnd of public service announcements\n\n" #"In 2D, the first dimension corresponds to rows, the second to columns." 
# we don't look at the first row cause that was just for initialization # the one without cats we put into the clustering algorithm wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0], len(args) + 1:wordmatrix.shape[1]] print "without", np.shape(wordmatrix_without_cat) wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ] print "with", np.shape(wordmatrix_with_cat) return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def matrixmachine(folderlist, featuredict, testmode, *args):
    """
    The matrixmachine creates matrices of word frequencies.
    It returns
    wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
    wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings.
    catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details.
    filedict, a dictionary that maps file names to rows in the matrix. For later comparison of clusterings.
    It takes
    The folderlist is a collection of folders to iterate over.
    The featuredict is a dictionary containing the words to count.
    If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. (Note that the testmode comes all the way from main())
    The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. Args will be added to the matrix_with_cat.
    """
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count=0
    catdicti=categorymachine(folderlist)[0]
    filedict={}
    # reduce featuredict to cluster-name -> set of member words,
    # dropping the module-level cluster_stopwords from every cluster
    featuredict={k:featuredict[k]['words'] for k in featuredict.keys()}
    featuredict={k:set([i for i in featuredict[k] if not i in cluster_stopwords]) for k in featuredict.keys()}
    for folder in folderlist:
        filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            # only the first 200 files per folder in test mode
            filis=filis[:200]
        print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            # NOTE(review): file handle is never closed explicitly
            inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputad=ct.adtextextractor(inputfile, fili)
            #establish category
            for external_cat in args:
                cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count=count+1
            filedict[count]=os.path.join(pathi, folder, fili)
            # insert a space at each stopregex boundary so the tokenizer splits there
            addspace=stopregex.sub(r"\g<1> \g<2>", inputad)
            splittext=nltk.word_tokenize(addspace)
            # drop tokens listed in the module-level `exclude` collection
            splittext=[s for s in splittext if s not in exclude]
            # lowercase, and drop empty tokens or those matching excluderegex
            splittextlo=[s.lower() for s in splittext if s and not excluderegex.match(s)]
            wordcount=float(len(splittextlo))
            #for each word2vec cluster: cluster/total words
            # this is a per word frequency
            #for t in featuredict:
                #print "\n", t#featuredict[t]
                #print [splittextlo.count(i) for i in featuredict[t]]
                #if sum ([splittextlo.count(i) for i in set(featuredict[t])]) > 10:
                #	print [i for i in splittextlo if i in featuredict[t]]
            #addict={k:[i for i in v] for k,v in featuredict.items()}
            # total occurrences of each cluster's words, then normalized per word
            addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
            addict={k:v/wordcount for k,v in addict.items()}
            #print addict
            # row = category code, file id, then the cluster frequencies;
            # .values() order is consistent within one run of this function
            wordvector=np.array([float(cat)]+[float(count)]+addict.values())
            #we append it to the matrix
            wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def matrixmachine(folderlist, featuredict, testmode, *args): """ The matrixmachine creates matrices of word frequencies / item frequencies. It returns wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering. wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. It takes The folderlist is a collection of folders to iterate over. The featuredict is a dictionary containing the words to count. If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. (Note that the testmode comes all the way from main()) The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. Args will be added to the matrix_with_cat. 
""" print "Starting the matrixmachine" print "external categories: ", len(args) print args #the plus one in here is for the file id wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1))) print "Matrix initial shape: ", np.shape(wordmatrix) # making a dictionary for the categories # we need the zero cause the machine returns 2 items count=0 catdicti=categorymachine(folderlist)[0] filedict={} featuredict=featuredict.keys() for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")] if testmode == True: print "\n\nRUNNING\nIN\nTEST\nMODE\n" filis=filis[:200] print "Building matrices: we have {} files in folder {}".format(len(filis), folder) for fili in filis: inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() #establish category for external_cat in args: cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)] count=count+1 filedict[count]=os.path.join(pathi, folder, fili) #note that punctuation is still in here #ARE WE READING IN THE ENTIRE AD HERE? NOT JUST THE TEXT? splittext=nltk.word_tokenize(inputfile) splittextlo=[i.lower() for i in splittext] wordcount=float(len(splittextlo)) # this is a per word frequency wordvector=np.array([float(cat)]+[float(count)]+[float(splittextlo.count(i))/wordcount for i in featuredict]) #print wordvector #we append it to the matrix wordmatrix=np.append(wordmatrix, [wordvector], axis=0) print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype) print "---------------\nEnd of public service announcements\n\n" #"In 2D, the first dimension corresponds to rows, the second to columns." 
# we don't look at the first row cause that was just for initialization # the one without cats we put into the clustering algorithm wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]] print "without", np.shape(wordmatrix_without_cat) wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],] print "with", np.shape(wordmatrix_with_cat) return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)