예제 #1
0
    def __init__(self, bigram_file):
        """Load the word lexicon and index the bigram records by first token.

        Reads up to NUMBER_OF_WORDS lines from "bigwordlist.txt", keeps the
        first whitespace-delimited token of each non-blank line, and passes
        the resulting list through LexiconImprover. Then groups every line of
        ``bigram_file`` under its first token in ``self.bigram_dict``.

        Args:
            bigram_file: iterable of text lines. Each line has its newline
                removed and tabs turned into spaces, is split on single
                spaces, and is stored (as the list of fields) under its
                first field.
        """
        NUMBER_OF_WORDS = 75000
        lexicon = []

        # The context manager closes the word list even if parsing fails.
        with open("bigwordlist.txt", "r") as bigWordsFile:
            for _ in range(NUMBER_OF_WORDS):
                line = bigWordsFile.readline()
                if not line:
                    # readline() returns "" at end of file: the list is
                    # shorter than NUMBER_OF_WORDS, so stop instead of
                    # indexing into an empty split result.
                    break
                fields = line.split()
                if fields:
                    # split() with no argument already discards all
                    # whitespace, so the first field is a clean word.
                    lexicon.append(fields[0])

        li = LexiconImprover(lexicon)
        # NOTE: despite its name, this attribute holds whatever
        # improveLexicon() returns, not the LexiconImprover instance.
        self.lexImprover = li.improveLexicon()

        self.pos_bigram_combination = []
        self.bigram_dict = {}
        self.best_bigram = []
        self.max_prob = 0

        # Group the bigram records by their first field.
        for line in bigram_file:
            line = line.replace("\n", "").replace("\t", " ").split(" ")
            self.bigram_dict.setdefault(line[0], []).append(line)
예제 #2
0

# Upper bound on how many lines of the word list are read.
NUMBER_OF_WORDS = 75000
lexicon = []

# Keep the first whitespace-delimited token of each non-blank line.
for i in range(NUMBER_OF_WORDS):
    line = bigWordsFile.readline()
    if not line:
        # readline() returns "" at end of file: the word list is shorter
        # than NUMBER_OF_WORDS, so stop rather than index an empty split.
        break
    tokens = line.split()
    if not tokens:
        # Blank line: nothing to add.
        continue
    # split() with no argument already strips all whitespace.
    wordOnly = tokens[0]
    lexicon.append(wordOnly)



# Improve the lexicon by removing infrequent words that are relatively few
# letters in length.
lexImprover = LexiconImprover(lexicon)
lexicon = lexImprover.improveLexicon()

allHashTags = hashTagFile.readlines()



# Hand-segmented sample hashtags. For every entry, concatenating the word
# list reproduces the hashtag without its leading '#'.
sampleHashTagsWithAnswers = {
    '#30secondstomars': ['30', 'seconds', 'to', 'mars'],
    '#americanidol': ['american', 'idol'],
    '#hurricaneike': ['hurricane', 'ike'],
    '#celebraterandommilestones': ['celebrate', 'random', 'milestones'],
    # Spelling corrected: the answer must match the letters of the tag.
    '#entrepreneurship': ['entrepreneurship'],
    '#firstdayofkindergarten': ['first', 'day', 'of', 'kindergarten'],
}


sampleHashTags = []