def compile_idf_dict(self): print "compiling idf dictionary from individual author files" idf_dict = {} path = '../TLG_idf_files/' + str(self.num_grams) + 'grams/' + str( self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str( self.stopwords) + "/" quarterHours = 0 filedict = file_dict('../ref_file.txt') files = filedict[self.subcorpora] for infile in files: print "now on: ", infile file = open(path + infile, 'r') num_docs = file.readline() #first line is number of docs in corpus for line in file: tokens = rpartition(line, "\t") term = tokens[0].strip() frequency = int(tokens[2].strip()) if term in idf_dict: idf_dict[term] += frequency else: idf_dict[term] = frequency quarterHours = self.check_time(quarterHours) self.idf_dict = idf_dict print time.clock() return len(files)
def __init__(self, spread, variant_word_order, corpus_filename=None,
             corpus=None, stopword_filename=None, DEFAULT_IDF=1.5):
    """Initialize the idf dictionary.

    If a corpus dict is supplied it is used directly; otherwise, if a
    corpus file is supplied, the idf dictionary is read from it in the
    format of:

        # of total documents
        term<TAB># of documents containing the term

    If a stopword file is specified, the stopword list is read from it,
    one stopword per line.  DEFAULT_IDF is returned when a query term is
    not found in the idf corpus.
    """
    self.spread = spread
    self.vwo = variant_word_order
    self.num_docs = 0
    self.term_num_docs = {}   # term : num_docs_containing_term
    self.idf_default = DEFAULT_IDF
    # FIX: always define the attribute so later readers of self.stopwords
    # don't hit AttributeError when no stopword file is given.
    self.stopwords = []
    if stopword_filename:
        # FIX: close the file (original leaked the handle).
        stopword_file = open(stopword_filename, "r")
        try:
            self.stopwords = [line.strip() for line in stopword_file]
        finally:
            stopword_file.close()

    # TODO: support create_idf_corpus(subcorpus, spread, ...) returning an
    # idf object (idf_dict, spread, ...); also create_stopword_file(),
    # compare(), graph(), and generate_tfidf_scores() for one author.

    if corpus:
        self.term_num_docs = corpus
    elif corpus_filename:
        # FIX: `with` guarantees the corpus file is closed.
        with open(corpus_filename, "r") as corpus_file:
            # First line is the total number of documents in the corpus.
            self.num_docs = int(corpus_file.readline().strip())
            # Each subsequent line is "term<TAB>frequency".
            for line in corpus_file:
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                self.term_num_docs[term] = frequency
def compile_idf_dict(self): print "compiling idf dictionary from individual author files" idf_dict = {} path = ( "../TLG_idf_files/" + str(self.num_grams) + "grams/" + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/" ) quarterHours = 0 filedict = file_dict("../ref_file.txt") files = filedict[self.subcorpora] for infile in files: print "now on: ", infile file = open(path + infile, "r") num_docs = file.readline() # first line is number of docs in corpus for line in file: tokens = rpartition(line, "\t") term = tokens[0].strip() frequency = int(tokens[2].strip()) if term in idf_dict: idf_dict[term] += frequency else: idf_dict[term] = frequency quarterHours = self.check_time(quarterHours) self.idf_dict = idf_dict print time.clock() return len(files)