def categorymachine(folderlist):

    print "starting category machine"
    catdicti = {}
    catnumber = 0
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputtext = ct.adtextextractor(inputfile, fili)
            #lets establish the category
            #we need to make it numeric, so the numpy won't screw up
            category = ct.tagextractor(inputfile, "category1", fili)
            try:
                cat = catdicti[category]
            except:
                print "We added {} to the category dictionary, coded as {}".format(
                    category, catnumber)
                catdicti[ct.tagextractor(inputfile, "category1",
                                         fili)] = catnumber
                catnumber = catnumber + 1
                cat = catdicti[ct.tagextractor(inputfile, "category1", fili)]
    return (catdicti, catnumber)
# Exemple #2
# 0
def matrixmachine(folderlist, featuredict, *args):
    #
    ###BUILDING MATRICES
    #
    # args are external categories
    #matrixmachine takes a list of folders	and of external categories to be included, note that it calls on the category machine
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)[0]
    filedict = {}
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        print "Building matrices: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis[:20]:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            #establish category
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            #note that punctuation is still in here
            splittext = nltk.word_tokenize(inputfile)
            splittextlo = [i.lower() for i in splittext]
            wordcount = float(len(splittextlo))
            # this is a per word frequency
            wordvector = np.array(
                [float(cat)] + [float(count)] +
                [float(splittextlo.count(i)) / wordcount for i in featuredict])
            #print wordvector
            #we append it to the matrix
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(
        np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0],
                                        len(args) + 1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def categorymachine(folderlist):
	print "starting category machine"
	catdicti={}
	catnumber=0
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith (".")]
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder,fili), "r", "utf-8").read()
			inputtext=ct.adtextextractor(inputfile, fili)
			# lets establish the category
			# we need to make it numeric, so the numpy won't screw up
			category=ct.tagextractor(inputfile, "category1", fili)
			try: 
				cat=catdicti[category]
			except:
				print "We added {} to the category dictionary, coded as {}".format(category, catnumber)
				catdicti[ct.tagextractor(inputfile, "category1", fili)]=catnumber
				catnumber=catnumber+1
				cat=catdicti[ct.tagextractor(inputfile, "category1", fili)]
	return (catdicti, catnumber)
def matrixmachine(folderlist, featuredict, *args):
	#
	###BUILDING MATRICES
	#
	# args are external categories
	#matrixmachine takes a list of folders	and of external categories to be included, note that it calls on the category machine 
	print "Starting the matrixmachine"
	print "external categories: ", len(args)
	print args
	#the plus one in here is for the file id
	wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1)))
	print "Matrix initial shape: ", np.shape(wordmatrix)
	# making a dictionary for the categories
	# we need the zero cause the machine returns 2 items
	count=0
	catdicti=categorymachine(folderlist)[0]
	filedict={}
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
		print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
		for fili in filis[:20]:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			#establish category
			for external_cat in args:
				cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)]
			count=count+1
			filedict[count]=os.path.join(pathi, folder, fili)
			#note that punctuation is still in here
			splittext=nltk.word_tokenize(inputfile)
			splittextlo=[i.lower() for i in splittext]
			wordcount=float(len(splittextlo))
			# this is a per word frequency
			wordvector=np.array([float(cat)]+[float(count)]+[float(splittextlo.count(i))/wordcount for i in featuredict])
			#print wordvector
			#we append it to the matrix
			wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
	print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
	print "---------------\nEnd of public service announcements\n\n"
	#"In 2D, the first dimension corresponds to rows, the second to columns."
	# we don't look at the first row cause that was just for initialization
	# the one without cats we put into the clustering algorithm
	wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]]
	print "without", np.shape(wordmatrix_without_cat)
	wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
	print "with", np.shape(wordmatrix_with_cat)
	return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
# Exemple #5
# 0
def wordcounter(input_dir, category_tag, category_dict):
    """
	counts the words per category in the files in input_dir.
	
	Parameters
	----------
	input_dir is the corpus directoty
	category_tag is the name of the tag to be extracted with tagextractor. 
	category_dict is a dictionary of categories to be computed over (category names as keys)
	e.g. <location="X"> would be input with "location" as the category_tag and a dict with {"Austin":0, "Dallas":0, ...}
	Returns
	-------
	something
	"""
    print "Running the wordcounter"
    resultdict = category_dict
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, pati))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r",
                             "utf-8") as inputfili:
                inputfili = inputfili.read()
            wordcount = len(
                ct.tokenizer(ct.adtextextractor(inputfili, fili),
                             remove_punctuation=True))
            category = ct.tagextractor(inputfili, category_tag, fili)
            if category in resultdict:
                resultdict[category] = resultdict[category] + wordcount
            else:
                print "\n\nWARNING:\n{} is not in the category_dict. What do we do now?\n\n".format(
                    category)
    print "Wordcounter done"

    with codecs.open("wordcounter_" + category_tag + ".json", "w",
                     "utf-8") as jsonout:
        json.dump(resultdict, jsonout)
# Exemple #6
# 0
def matrixmachine(folderlist, featuredict, external_category): 
	#constructing matrix
	print "Starting the matrixmachine"
	wordmatrix=np.empty(shape=(1,len(featuredict)+1))
	print "Matrix initial shape: ", np.shape(wordmatrix)
	# making a dictionary for the categories
	# we need the zero cause the machine returns 2 items
	catdicti=categorymachine(folderlist)[0]
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
		print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			#establish category
			cat=catdicti[ct.tagextractor(inputfile, external_category, fili)]
			#collect a dictionary with all lowercased words
			#note that punctuation is still in here
			splittext=nltk.word_tokenize(inputfile)
			# we lowercase
			splittextlo=[i.lower() for i in splittext]
			#number of "words"
			wordcount=float(len(splittextlo))
			# we make the vector for this file
			# this is a per word frequency
			wordvector=np.array([float(cat)]+[float(splittextlo.count(i))/wordcount for i in featuredict])
			#print wordvector
			#we append it to the matrix
			wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
	print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
	print "---------------\nEnd of public service announcements\n\n"
	#"In 2D, the first dimension corresponds to rows, the second to columns."
	# we don't look at the first row cause that was just for initialization
	# the one without cats we put into the clustering algorithm
	wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],1:wordmatrix.shape[1]]
	wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
	return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti)
def matrixmachine(folderlist, featuredict, testmode, *args):
    """
	The matrixmachine creates matrices of word frequencies / item frequencies.
	It returns 
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
	wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. 
	catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. 
	filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. 
	It takes
	The folderlist is a collection of folders to iterate over. 
	The featuredict is a dictionary containing the words to count.
	If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. 
	(Note that the testmode comes all the way from main())
	The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. 
	Args will be added to the matrix_with_cat. 
	"""
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)[0]
    filedict = {}
    featuredict = featuredict.keys()
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        if testmode == True:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis = filis[:200]
        print "Building matrices: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            #establish category
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            #note that punctuation is still in here
            splittext = nltk.word_tokenize(inputfile)
            splittextlo = [i.lower() for i in splittext]
            wordcount = float(len(splittextlo))
            # this is a per word frequency
            wordvector = np.array(
                [float(cat)] + [float(count)] +
                [float(splittextlo.count(i)) / wordcount for i in featuredict])
            #print wordvector
            #we append it to the matrix
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(
        np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0],
                                        len(args) + 1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def matrixmachine(folderlist, featuredict, testmode, *args):

	"""
	The matrixmachine creates matrices of word frequencies.
	It returns 
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
	wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. 
	catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. 
	filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. 
	It takes
	The folderlist is a collection of folders to iterate over. 
	The featuredict is a dictionary containing the words to count.
	If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. 
	(Note that the testmode comes all the way from main())
	The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. 
	Args will be added to the matrix_with_cat. 
	"""
	print "Starting the matrixmachine"
	print "external categories: ", len(args)
	print args
	#the plus one in here is for the file id
	wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1)))
	print "Matrix initial shape: ", np.shape(wordmatrix)
	# making a dictionary for the categories
	# we need the zero cause the machine returns 2 items
	count=0
	catdicti=categorymachine(folderlist)[0]
	filedict={}
	# reduce each cluster entry to its word list; assumes featuredict values
	# are dicts with a 'words' key -- TODO confirm against the caller
	featuredict={k:featuredict[k]['words'] for k in featuredict.keys()}
	# drop words listed in the module-level cluster_stopwords set
	featuredict={k:set([i for i in featuredict[k] if not i in cluster_stopwords]) for k in featuredict.keys()}
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
		if testmode:
			print "\n\nRUNNING\nIN\nTEST\nMODE\n"
			filis=filis[:200]
		print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			# only the ad text (not the whole file) is tokenized here
			inputad=ct.adtextextractor(inputfile, fili)
			#establish category
			for external_cat in args:
				cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)]
			count=count+1
			filedict[count]=os.path.join(pathi, folder, fili)
			# module-level stopregex: presumably inserts a space between the
			# two captured groups to split fused tokens -- verify its pattern
			addspace=stopregex.sub(r"\g<1> \g<2>", inputad)
			splittext=nltk.word_tokenize(addspace)
			# exclude and excluderegex are module-level token filters
			splittext=[s for s in splittext if s not in exclude]
			splittextlo=[s.lower() for s in splittext if s and not excluderegex.match(s)]
			wordcount=float(len(splittextlo))
			#for each word2vec cluster: cluster/total words
			# this is a per word frequency
			#for t in featuredict:
				#print "\n", t#featuredict[t]
				#print [splittextlo.count(i) for i in featuredict[t]]
				#if sum ([splittextlo.count(i) for i in set(featuredict[t])]) > 10:
				#	print [i for i in splittextlo if i in featuredict[t]]
			#addict={k:[i for i in v] for k,v in featuredict.items()} 
			# raw per-cluster counts: total occurrences of any cluster word
			addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
			# normalize to per-word frequencies
			addict={k:v/wordcount for k,v in addict.items()}
			#print addict
			# NOTE(review): addict.values() relies on dict iteration order;
			# consistent within this run, but column order is not the key
			# order -- confirm downstream consumers do not assume one
			wordvector=np.array([float(cat)]+[float(count)]+addict.values())
			#we append it to the matrix
			wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
	print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
	print "---------------\nEnd of public service announcements\n\n"
	#"In 2D, the first dimension corresponds to rows, the second to columns."
	# we don't look at the first row cause that was just for initialization
	# the one without cats we put into the clustering algorithm
	wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]]
	print "without", np.shape(wordmatrix_without_cat)
	wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
	print "with", np.shape(wordmatrix_with_cat)
	return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def matrixmachine(folderlist, featuredict, testmode, *args):

	"""
	The matrixmachine creates matrices of word frequencies / item frequencies.
	It returns 
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
	wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. 
	catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. 
	filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. 
	It takes
	The folderlist is a collection of folders to iterate over. 
	The featuredict is a dictionary containing the words to count.
	If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. 
	(Note that the testmode comes all the way from main())
	The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. 
	Args will be added to the matrix_with_cat. 
	"""
	print "Starting the matrixmachine"
	print "external categories: ", len(args)
	print args
	#the plus one in here is for the file id
	wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1)))
	print "Matrix initial shape: ", np.shape(wordmatrix)
	# making a dictionary for the categories
	# we need the zero cause the machine returns 2 items
	count=0
	catdicti=categorymachine(folderlist)[0]
	filedict={}
	featuredict=featuredict.keys()
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
		if testmode == True:
			print "\n\nRUNNING\nIN\nTEST\nMODE\n"
			filis=filis[:200]
		print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			#establish category
			for external_cat in args:
				cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)]
			count=count+1
			filedict[count]=os.path.join(pathi, folder, fili)
			#note that punctuation is still in here
			#ARE WE READING IN THE ENTIRE AD HERE? NOT JUST THE TEXT?
			splittext=nltk.word_tokenize(inputfile)
			splittextlo=[i.lower() for i in splittext]
			wordcount=float(len(splittextlo))
			# this is a per word frequency
			wordvector=np.array([float(cat)]+[float(count)]+[float(splittextlo.count(i))/wordcount for i in featuredict])
			#print wordvector
			#we append it to the matrix
			wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
	print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
	print "---------------\nEnd of public service announcements\n\n"
	#"In 2D, the first dimension corresponds to rows, the second to columns."
	# we don't look at the first row cause that was just for initialization
	# the one without cats we put into the clustering algorithm
	wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]]
	print "without", np.shape(wordmatrix_without_cat)
	wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
	print "with", np.shape(wordmatrix_with_cat)
	return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)