else: self.wordcounts[word] = 1 elif tok.group(2): tok_type = "sent" last_end = tok_end tail = temp_buffer[last_end:] # print "keeping tail", tail self.buffers = [tail] if __name__ == "__main__": a = sys.argv[1] b = sys.argv[2] print "start" a_tokens = TokenCounter("(\w+)|([\.\!\?])") a_census = TagCensus(text_target=a_tokens) a_census.parse(open(a).read()) print a, "done" # print a,a_tokens.wordcounts b_tokens = TokenCounter("(\w+)|([\.\!\?])") b_census = TagCensus(text_target=b_tokens) b_census.parse(open(b).read()) print b, "done" # print b,b_tokens.wordcounts for word in a_tokens.wordcounts.keys(): if word in b_tokens.wordcounts.keys(): if a_tokens.wordcounts[word] != b_tokens.wordcounts[word]: print word,a_tokens.wordcounts[word],b_tokens.wordcounts[word] else:
self.wordcounts[word] = 1 elif tok.group(2): tok_type = "sent" last_end = tok_end tail = temp_buffer[last_end:] # print "keeping tail", tail self.buffers = [tail] if __name__ == "__main__": a = sys.argv[1] b = sys.argv[2] print("start") a_tokens = TokenCounter("(\w+)|([\.\!\?])") a_census = TagCensus(text_target=a_tokens) a_census.parse(open(a).read()) print(a, "done") # print a,a_tokens.wordcounts b_tokens = TokenCounter("(\w+)|([\.\!\?])") b_census = TagCensus(text_target=b_tokens) b_census.parse(open(b).read()) print(b, "done") # print b,b_tokens.wordcounts for word in a_tokens.wordcounts.keys(): if word in list(b_tokens.wordcounts.keys()): if a_tokens.wordcounts[word] != b_tokens.wordcounts[word]: print(word, a_tokens.wordcounts[word], b_tokens.wordcounts[word])
# NOTE(review): whitespace-collapsed Python 2 fragment, left byte-for-byte
# unchanged because it starts inside one definition and the script it contains
# continues past the end of this view.
#
# Contents, in order:
#   1. Tail of a helper that tracks the largest len(str(term)) seen and returns
#      it as `longest_term` (presumably find_longest_string, which the script
#      below calls -- confirm against the full file).
#   2. Start of the __main__ script, which:
#      - reads two paths from sys.argv[1] and sys.argv[2];
#      - opens "<basename a>_<basename b>_diff" for writing as `output`
#        (no close of `output` is visible in this chunk);
#      - reads each input file, decodes it as UTF-8 ignoring undecodable bytes,
#        passes the text to token_counter() and to TagCensus().parse();
#      - writes per-file token totals to `output`;
#      - computes column widths from the longest token and the longest count;
#      - initialises `results` to an empty list (its use is outside this view).
#
# Python 2 only as written: print statements, `print >> output, ...`, and
# `.decode()` called on the result of `read()`.
length = len(str(term)) ## integer if length > longest_term: longest_term = length return longest_term if __name__ == '__main__': a = sys.argv[1] b = sys.argv[2] output = open("%s_%s_diff" % (os.path.basename(a), os.path.basename(b)), 'w') print "\n### Comparing %s to %s ####\n" % (a,b) a_text = open(a).read().decode('utf-8', 'ignore') a_tokens = token_counter(a_text) a_census = TagCensus() a_census.parse(a_text) print >> output, "TEXT 1:", a, "done parsing..." print >> output, len(a_tokens), "total word tokens\n" b_text = open(b).read().decode('utf-8', 'ignore') b_tokens = token_counter(b_text) b_census = TagCensus() b_census.parse(b_text) print >> output, "TEXT 2:", b, "done parsing..." print >> output, len(b_tokens), "total tokens\n" print >> output, "### Token differences ###\n" token_col_length = find_longest_string(set([word for word in a_tokens] + [word for word in b_tokens])) num_col_length = find_longest_string(set([a_tokens[i] for i in a_tokens] + [b_tokens[i] for i in b_tokens])) results = []