Ejemplo n.º 1
0
                else:
                    self.wordcounts[word] = 1
            elif tok.group(2):
                tok_type = "sent"                
            last_end = tok_end
        tail = temp_buffer[last_end:]
        # print "keeping tail", tail
        self.buffers = [tail]

if __name__ == "__main__":
    a = sys.argv[1]
    b = sys.argv[2]

    print "start"
    a_tokens = TokenCounter("(\w+)|([\.\!\?])")
    a_census = TagCensus(text_target=a_tokens)
    a_census.parse(open(a).read())
    print a, "done"
#    print a,a_tokens.wordcounts

    b_tokens = TokenCounter("(\w+)|([\.\!\?])")
    b_census = TagCensus(text_target=b_tokens)
    b_census.parse(open(b).read())
    print b, "done"
#    print b,b_tokens.wordcounts

    for word in a_tokens.wordcounts.keys():
        if word in b_tokens.wordcounts.keys():
            if a_tokens.wordcounts[word] != b_tokens.wordcounts[word]:
                print word,a_tokens.wordcounts[word],b_tokens.wordcounts[word]
        else:
                    self.wordcounts[word] = 1
            elif tok.group(2):
                tok_type = "sent"
            last_end = tok_end
        tail = temp_buffer[last_end:]
        # print "keeping tail", tail
        self.buffers = [tail]


if __name__ == "__main__":
    # Command-line entry point.
    # Usage: <script> FILE_A FILE_B
    #
    # Parses both files through a TagCensus whose text target is a
    # TokenCounter, then prints one line per word that occurs in BOTH files
    # with a different count: "<word> <count in A> <count in B>".
    # Words present in only one of the files are not reported.
    a = sys.argv[1]
    b = sys.argv[2]

    # Raw string so the backslashes reach the regex engine untouched and no
    # invalid-escape warning is raised at compile time.
    # Group 1 captures a word, group 2 a sentence-ending punctuation mark.
    token_pattern = r"(\w+)|([\.\!\?])"

    print("start")
    a_tokens = TokenCounter(token_pattern)
    a_census = TagCensus(text_target=a_tokens)
    # Context manager closes the file handle as soon as it has been read.
    with open(a) as a_file:
        a_census.parse(a_file.read())
    print(a, "done")

    b_tokens = TokenCounter(token_pattern)
    b_census = TagCensus(text_target=b_tokens)
    with open(b) as b_file:
        b_census.parse(b_file.read())
    print(b, "done")

    a_counts = a_tokens.wordcounts
    b_counts = b_tokens.wordcounts
    for word in a_counts:
        # Test membership on the mapping itself instead of materialising a
        # throwaway list of keys for every word.
        if word in b_counts and a_counts[word] != b_counts[word]:
            print(word, a_counts[word], b_counts[word])
Ejemplo n.º 3
0
            length = len(str(term)) ## integer
        if length > longest_term:
            longest_term = length
    return longest_term
    

if __name__ == '__main__':
    a = sys.argv[1]
    b = sys.argv[2]
    
    output = open("%s_%s_diff" % (os.path.basename(a), os.path.basename(b)), 'w')

    print "\n### Comparing %s to %s ####\n" % (a,b)
    a_text = open(a).read().decode('utf-8', 'ignore')
    a_tokens = token_counter(a_text)
    a_census = TagCensus()
    a_census.parse(a_text)
    print >> output, "TEXT 1:", a, "done parsing..."
    print >> output, len(a_tokens), "total word tokens\n"

    b_text = open(b).read().decode('utf-8', 'ignore')
    b_tokens = token_counter(b_text)
    b_census = TagCensus()
    b_census.parse(b_text)
    print >> output, "TEXT 2:", b, "done parsing..."
    print >> output, len(b_tokens), "total tokens\n"
    
    print >> output, "### Token differences ###\n"
    token_col_length = find_longest_string(set([word for word in a_tokens] + [word for word in b_tokens]))
    num_col_length = find_longest_string(set([a_tokens[i] for i in a_tokens] + [b_tokens[i] for i in b_tokens]))
    results = []