Code example #1
def create_bigram_model(span_eng_dict):
    bigram_span_dict = collections.defaultdict(int)
    bigram_eng_dict = collections.defaultdict(int)
    bigram_span_eng_dict = {}
    text = codecs.open('SpanishText.txt', encoding='utf-8')
    for sentence in text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2 in bigrams(line):
            # count each Spanish bigram and, when both words have a known
            # translation, record the corresponding English word pair
            bigram_span_dict[(word1.lower(), word2.lower())] += 1
            try:
                bigram_span_eng_dict[(word1.lower(), word2.lower())] = (
                    span_eng_dict[word1.lower()], span_eng_dict[word2.lower()])
            except KeyError:
                pass
    eng_text = open('DMT_translate.txt')
    for sentence in eng_text.readlines():
        line = [re.sub('[.?!",]', '', word) for word in sentence.split()]
        for word1, word2 in bigrams(line):
            bigram_eng_dict[(word1.lower(), word2.lower())] += 1

    text = ''
    for k, v in bigram_span_eng_dict.items():
        # keep the Spanish bigrams whose count matches the count of their
        # English translation pair
        if bigram_span_dict.get(k) == bigram_eng_dict.get(v) and bigram_span_dict[k] >= 1:
            text += k[0] + k[1]
    return text
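For context, bigrams throughout these examples is assumed to be nltk.util.bigrams, which yields adjacent token pairs from a token sequence; a minimal sketch:

from nltk.util import bigrams

tokens = "el gato come pescado".split()
print(list(bigrams(tokens)))
# [('el', 'gato'), ('gato', 'come'), ('come', 'pescado')]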
Code example #2
def wiki_to_feature(wiki):
    """
    Specifically handles a single wiki document
    :param wiki: dict for wiki fields
    :type wiki: dict
    :return: tuple with wiki id and list of feature strings
    :rtype: tuple
    """
    try:
        features = []
        bow = []
        features += [u'ORIGINAL_HUB:%s' % wiki.get(u'hub_s', u'')]
        features += [u'TOP_CAT:%s' % u'_'.join(normalize(c)) for c in wiki.get(u'top_categories_mv_en', [])]
        bow += [u"_".join(normalize(c)) for c in wiki.get(u'top_categories_mv_en', [])]
        features += [u'TOP_ART:%s' % u"_".join(normalize(a)) for a in wiki.get(u'top_articles_mv_en', [])]
        bow += [u"_".join(normalize(a)) for a in wiki.get(u'top_articles_mv_en', [])]
        desc_ngrams = [u"_".join(n) for grouping in
                       [bigrams(normalize(np))
                       for np in TextBlob(wiki.get(u'description_txt', [u''])[0]).noun_phrases]
                       for n in grouping]
        bow += desc_ngrams
        features += [u'DESC:%s' % d for d in desc_ngrams]
        bow += [u"_".join(b) for b in bigrams(normalize(wiki[u'sitename_txt'][0]))]
        mp_nps = TextBlob(wiki.get(u'main_page_text', u'')).noun_phrases
        bow += [u"_".join(bg) for grouping in [bigrams(normalize(n)) for n in mp_nps] for bg in grouping]
        bow += [u''.join(normalize(w)) for words in [np.split(u" ") for np in mp_nps] for w in words]
        return wiki[u'id'], bow + features
    except Exception as e:
        print e, format_exc()
        raise e
Code example #3
def wiki_to_feature(wiki):
    """
    Specifically handles a single wiki document
    :param wiki: dict for wiki fields
    :type wiki: dict
    :return: tuple with wiki id and list of feature strings
    :rtype: tuple
    """
    try:
        features = []
        bow = []
        features += [u'ORIGINAL_HUB:%s' % wiki.get(u'hub_s', u'')]
        features += [
            u'TOP_CAT:%s' % u'_'.join(normalize(c))
            for c in wiki.get(u'top_categories_mv_en', [])
        ]
        bow += [
            u"_".join(normalize(c))
            for c in wiki.get(u'top_categories_mv_en', [])
        ]
        features += [
            u'TOP_ART:%s' % u"_".join(normalize(a))
            for a in wiki.get(u'top_articles_mv_en', [])
        ]
        bow += [
            u"_".join(normalize(a))
            for a in wiki.get(u'top_articles_mv_en', [])
        ]
        desc_ngrams = [
            u"_".join(n) for grouping in [
                bigrams(normalize(np)) for np in TextBlob(
                    wiki.get(u'description_txt', [u''])[0]).noun_phrases
            ] for n in grouping
        ]
        bow += desc_ngrams
        features += [u'DESC:%s' % d for d in desc_ngrams]
        bow += [
            u"_".join(b) for b in bigrams(normalize(wiki[u'sitename_txt'][0]))
        ]
        mp_nps = TextBlob(wiki.get(u'main_page_text', u'')).noun_phrases
        bow += [
            u"_".join(bg)
            for grouping in [bigrams(normalize(n)) for n in mp_nps]
            for bg in grouping
        ]
        bow += [
            u''.join(normalize(w))
            for words in [np.split(u" ") for np in mp_nps] for w in words
        ]
        return wiki[u'id'], bow + features
    except Exception as e:
        print e, format_exc()
        raise e
Code example #4
File: features.py Project: diegocaro/opinionapp
def getFeatures(tokens, typefeat='unigrams'):

    if typefeat == 'unigrams':
        _features = FreqDist(tokens)

    elif typefeat == 'bigrams':
        _bigrams = bigrams(tokens)
        _features = FreqDist(_bigrams)

    elif typefeat == 'uni+bigrams':
        _bigrams = list(bigrams(tokens))
        _features = FreqDist(_bigrams + tokens)

    return _features
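A hypothetical call, assuming FreqDist and bigrams come from NLTK as the example implies:

from nltk import FreqDist
from nltk.util import bigrams

tokens = "to be or not to be".split()
print(getFeatures(tokens, 'bigrams').most_common(2))
# [(('to', 'be'), 2), (('be', 'or'), 1)]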
Code example #5
def Merge_for_summary(lis):
	tweets = lis
	all_bigrams = [list(bigrams(tweet)) for tweet in tweets]
	starting_nodes = [single_bigram[0] for single_bigram in all_bigrams]
	end_nodes = [single_bigram[-1] for single_bigram in all_bigrams]
	all_bigrams = [node for single_bigram in all_bigrams for node in single_bigram]
	all_bigrams = list(set(all_bigrams))

	bigram_paths = []

	for single_start_node in tqdm(starting_nodes): 
	    bigram_graph = make_bigram_graph(all_bigrams, single_start_node)
	    for single_end_node in end_nodes:
	        possible_paths = breadth_first_search(bigram_graph, single_start_node, single_end_node)
	        for path in possible_paths: 
	            bigram_paths.append(path)

	for tweet in tweets:
	    bigram_paths.append(list(bigrams(tweet)))
	word_paths = []
	for path in tqdm(bigram_paths): 
	    word_paths.append(make_list(path))


	begin('COWABS')
	x = var(str('x'), len(word_paths), bool)
	y = var(str('y'), len(content_vocab), bool)
	maximize(sum([linguistic_quality(word_paths[i])*informativeness(word_paths[i])*x[i] for i in range(len(x))]) + sum(y));
	sum([x[i]*len(word_paths[i]) for i in range(len(x))]) <= L;
	for j in range(len(y)):
	    sum([x[i] for i in paths_with_content_words(j)])>= y[j]

	for i in range(len(x)):
	    sum(y[j] for j in content_words(i)) >= len(content_words(i))*x[i]

	solve()
	result_x =  [value.primal for value in x]
	result_y = [value.primal for value in y]
	end()
	chosen_paths = np.nonzero(result_x)
	chosen_words = np.nonzero(result_y)
	st = ''
	for i in chosen_paths[0]:
	   st += str(" ").join([token.encode('ascii', 'ignore') for token in word_paths[i]])
	   print ('. ')
	return st
Code example #6
def buildBiIndex(inverIndex):
    biIndex = {}
    print("Building Bigram Index ...")
    for token in inverIndex:
        bigramList = list(bigrams(token))
        bigramListLen = len(list(bigramList))
        for index, (firstChar, lastChar) in enumerate(bigramList):
            bigram = firstChar + lastChar

            # nltk's bigram generator does not create $m
            if index == 0:
                startBigram = "$" + firstChar
                if biIndex.get(startBigram):
                    biIndex[startBigram].append(token)
                else:
                    biIndex[startBigram] = [token]

            # nltk's bigram generator does not create m$
            elif index == bigramListLen - 1:
                endBigram = lastChar + "$"
                if biIndex.get(endBigram):
                    biIndex[endBigram].append(token)
                else:
                    biIndex[endBigram] = [token]

            if biIndex.get(bigram):
                biIndex[bigram].append(token)
            else:
                biIndex[bigram] = [token]

    print("Finished building Bigram Index")
    return biIndex
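A small usage sketch under assumptions: the dictionary below stands in for a real inverted index (only its keys matter here), and the function above is assumed to have nltk.util.bigrams in scope. The $-padded entries make wildcard-style lookups possible.

inverted_index = {"hello": [1, 3], "halo": [2], "world": [1]}
bi_index = buildBiIndex(inverted_index)
print(bi_index["$h"])  # ['hello', 'halo'] -- tokens starting with 'h'
print(bi_index["lo"])  # ['hello', 'halo'] -- tokens containing 'lo'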
Code example #7
File: pos-tagger.py Project: lycarter/863-cgw
def pos_tags(vocab_hash, sentence):
    sentence = sentence.split()
    unigram_hash = get_pos(vocab_hash, sentence)
    bigram_hash = get_pos(vocab_hash, bigrams(sentence))
    trigram_hash = get_pos(vocab_hash, trigrams(sentence))
    pos_tags = []
    ngram_to_tag = {}
    ngram_ordering = []
    i = 0
    # walk the sentence with an explicit index so that the words consumed by a
    # matched bigram or trigram are skipped on the next step
    while i < len(sentence):
        word = sentence[i]
        if unigram_hash.has_key(word):
            tag = unigram_hash[word]
            pos_tags.append(tag)
            ngram_to_tag[word] = tag
            ngram_ordering.append(word)
        elif i < len(sentence) - 1:
            bigram = sentence[i] + " " + sentence[i + 1]
            if bigram_hash.has_key(bigram):
                tag = bigram_hash[bigram]
                pos_tags.append(tag)
                ngram_to_tag[bigram] = tag
                ngram_ordering.append(bigram)
                i += 1
            elif i < len(sentence) - 2:
                trigram = " ".join(sentence[i:i + 3])
                if trigram_hash.has_key(trigram):
                    tag = trigram_hash[trigram]
                    pos_tags.append(tag)
                    ngram_to_tag[trigram] = tag
                    ngram_ordering.append(trigram)
                    i += 2
        i += 1
    return pos_tags, ngram_to_tag, ngram_ordering
Code example #8
def gender_feature(text, feature_vect):
    """
    Extract the gender features
    :param text:
    :param feature_vect: contains a bag of words and a list of bigrams
    :return: a dictionary which contains the feature and its computed value
    """
    #sentence length and vocab features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    #bag_of_word features
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens

    #bigrams features
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in bigrams(tokens)

    #POS tagging features
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1]))
                              for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word
                             if tag[1] in POS_tag)

    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)

    return dict(dict(d, **bigram_dict), **freq_POS)
Code example #9
 def get_bigram_tags(self) -> Iterator[Tuple[str, str]]:
     "first concatenate all POS tags in all sentences in bijenkhan corpus"
     seq_tags = []
     gen = self.sent_tag_gen(100)
     for _, tags in gen:
         for tag_seq in tags:
             seq_tags.extend(tag_seq)
     return list(bigrams(seq_tags))
Code example #10
File: rake_stem.py Project: neethukurian/keyextract
def generate_unibitrigrams(key_score_file):
    with open(key_score_file, 'rb') as infile:
        infile.readline()
        key_list = list()
        for line in infile:
            row = list(line.split(','))
            key_list.append(row[0])
    uni_bi_trigrams = []
    for phrase in key_list:
        words = []
        unigrams_ls = []
        bigrams_ls = []
        trigrams_ls = []
        for word in nltk.word_tokenize(phrase):
            word = re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@[\]\^_`{|}~]', '', word)
            words.append(word)
        unigrams_ls = words
        #bigrams_ls=list(bigrams(words))

        for x in list(bigrams(words)):
            bigrams_ls.append(x[0] + ' ' + x[1])

        for x in list(trigrams(words)):
            trigrams_ls.append(x[0] + ' ' + x[1] + ' ' + x[2])
        #trigrams_ls=list(trigrams(words))
        uni_bi_trigrams = uni_bi_trigrams + unigrams_ls + bigrams_ls + trigrams_ls
    return uni_bi_trigrams
Code example #11
    def act(self):
        """
        Add words in the last observation to the dictionary.

        This checks any fields in the message present in the --dict-textfields
        argument (e.g. "text,labels").
        """
        for textfield in self.textfields:
            source = self.observation.get(textfield)
            if source is None:
                continue
            # fields may be singleton strings or lists of strings.
            # wrap the singleton strings in a list to iterate over them
            if type(source) is str:
                source = [source]
            for text in source:
                if text:
                    tokens = self.tokenize(text)
                    self.add_to_dict(tokens)
                    unigram_ = nltk.ngrams(tokens, 1)
                    bigrams_ = bigrams(tokens)
                    trigrams_ = trigrams(tokens)
                    self.unigram_freq.update(unigram_)
                    self.bigram_freq.update(bigrams_)
                    self.trigram_freq.update(trigrams_)
        return {'id': 'Dictionary'}
Code example #12
def autocorrect_query(query,df,cutoff=0.8,warning_on=True):
    """
    autocorrect a query based on the training set
    """	
    train_data = df.values[df['search_term'].values==query,:]
    s = ""
    for r in train_data:
        w = r
        s = "%s %s %s"%(s,BeautifulSoup(r[1]).get_text(" ",strip=True),BeautifulSoup(r[2]).get_text(" ",strip=True))
    s = re.findall(r'[\'\"\w]+',s.lower())
    s_bigram = [' '.join(i) for i in bigrams(s)]
    s.extend(s_bigram)
    corrected_query = []	
    for q in query.lower().split():
        if len(q)<=2:
            corrected_query.append(q)
            continue
        if bool(re.search('\d', q)): # skip if it is word with number, like 4.5in_
            corrected_query.append(q)
            continue
        corrected_word = difflib.get_close_matches(q, s,n=1,cutoff=cutoff)
        if len(corrected_word) >0:
            corrected_query.append(corrected_word[0])
        else :
            if warning_on:
                print("WARNING: cannot find matched word for '%s' -> used the original word"%(q))
            corrected_query.append(q)	
    return ' '.join(corrected_query)
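A hypothetical call with a toy DataFrame; the column names and rows are invented, but match the shape the function expects (the search term in the first column, HTML product text in the next two), and the module is assumed to import re, difflib and BeautifulSoup as the code implies.

import pandas as pd

df = pd.DataFrame({
    'search_term': ['wod flooring', 'wod flooring'],
    'product_title': ['<b>Wood Flooring Plank</b>', 'Oak wood flooring'],
    'product_description': ['Solid wood plank flooring', 'Engineered wood floor'],
})
print(autocorrect_query('wod flooring', df))  # -> 'wood flooring'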
Code example #13
 def list_followers(self, word):
     followers = set()
     for tup in list(bigrams(flattened)):
         if tup[0] == word:
             followers.add(tup[1])
     print(followers)
     return followers
Code example #14
 def aggregate_topics_of_segmented_reports(self, cut_of_segmented_reports, topics):
     aggregated_topics = []
     bigrams_of_topics = bigrams(map(lambda x: [x.decode('utf-8')], topics))
     for i in range(len(bigrams_of_topics)):
         for j in range(len(cut_of_segmented_reports)):
             aggregated_topics.extend(cut_of_segmented_reports[j][cut_of_segmented_reports[j].index(bigrams_of_topics[i][0]):cut_of_segmented_reports[j].index(bigrams_of_topics[i][1])])
     return aggregated_topics
Code example #15
def gender_feature(text, feature_vect):
    """
    Extract the gender features
    :param text:
    :param feature_vect: contains a bag of words and a list of bigrams
    :return: a dictionary which contains the feature and its computed value
    """
    #sentence length and vocab features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    #bag_of_word features
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens

    #bigrams features
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in bigrams(tokens)

    #POS tagging features
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1])) for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag)

    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)

    return dict(dict(d, **bigram_dict), **freq_POS)
Code example #16
def n_grams(word):
    bigrams_text = []
    with open(r"C:\PyCharmGrammarly\Grammarly\lyrics.txt", 'r') as tok:
        raw_text = tok.read()
        raw_text = raw_text.split("\n")
        lines = []
        for i in raw_text:
            tokens = word_tokenize(i)
            lines.append(tokens)
        for l in lines:
            b = list(bigrams(l))
            bigrams_text.extend(b)
        k = 0
        song = []
        selection = []
        while k <= 96:
            k = k + 1
            for first in bigrams_text:
                if word == first[0]:
                    selection.append(first)
            freq_bi = FreqDist(selection)
            best = freq_bi.most_common()
            word = random.choice(best)
            word = word[0][1]
            song.append(word)
        words_in_line = 0
        for lyric in song:
            words_in_line += 1
            if words_in_line % 5 == 0:
                print(lyric, "\n")
                continue
            else:
                print(lyric, end=" ")
Code example #17
File: count_bigrams.py Project: TheNewDaysDawn/ibmdw
def count_bigrams(input_fp, frequencies, buffer_size=1024):
    '''Read the text content of a file and keep a running count of how often
    each bigram (sequence of two) characters appears.

    Arguments:
        input_fp -- file pointer with input text
        frequencies -- mapping from each bigram to its counted frequency
        buffer_size -- incremental quantity of text to be read at a time,
            in bytes (1024 if not otherwise specified)

    Returns:
        nothing
    '''
    #Read the first chunk of text, and set all letters to lowercase
    text = input_fp.read(buffer_size).lower()
    #Loop over the file while there is text to read
    while text:
        spans = TOKENIZER.span_tokenize(text)
        tokens = (text[begin : end] for (begin, end) in spans)
        for bigram in bigrams(tokens):
            #Increment the count for the bigram. Automatically handles any
            #bigram not seen before. The join expression turns 2 separate 
            #single-character strings into one 2-character string
            frequencies[''.join(bigram)] += 1
        #Read the next chunk of text, and set all letters to lowercase
        text = input_fp.read(buffer_size).lower()

    return
Code example #18
def sentence_preprocessing(sentence):
    
    sentence_clean = []
    
    # Remove HTML entities
    sentence = BeautifulSoup(sentence, "lxml").get_text()
     
    # Tokenize
    tokens = preprocess(sentence, lowercase=False)
    
    # Step 1: Remove all useless things
    tokens = [tk for tk in tokens if tk not in punc] # Punctuation
    #tokens = [tk for tk in tokens if tk not in stop] # Stopwords
    tokens = [tk for tk in tokens if re.match(link_pattern, tk) == None] # Link
    tokens = [tk for tk in tokens if re.match(html_entities_pattern, tk) == None] # HTML entities
    tokens = [tk for tk in tokens if tk.lower() not in remove_terms] # Some special terms to remove
    
    # Step 2: Add some lower/upper case
    tokens_lower = [tk.lower() for tk in tokens] # Lowercase
    tokens_upper = [tk.upper() for tk in tokens] # Uppercase
    tokens_title = [tk.title() for tk in tokens] # Title
    tokens = tokens + tokens_lower + tokens_upper + tokens_title
    
    # Step 3: Add bigram
    bigrams_words = ['_'.join(w) for w in list(bigrams(tokens))]
    sentence_clean = ' '.join(tokens) + ' ' + ' '.join(bigrams_words)
    
    return sentence_clean
Code example #19
def count_bigrams(input_fp, frequencies, buffer_size=1024):
    '''Read the text content of a file and keep a running count of how often
    each bigram (sequence of two) characters appears.

    Arguments:
        input_fp -- file pointer with input text
        frequencies -- mapping from each bigram to its counted frequency
        buffer_size -- incremental quantity of text to be read at a time,
            in bytes (1024 if not otherwise specified)

    Returns:
        nothing
    '''
    #Read the first chunk of text, and set all letters to lowercase
    text = input_fp.read(buffer_size).lower()
    #Loop over the file while there is text to read
    while text:
        spans = TOKENIZER.span_tokenize(text)
        tokens = (text[begin:end] for (begin, end) in spans)
        for bigram in bigrams(tokens):
            #Accommodate the bigram if seen for the first time
            frequencies.setdefault(bigram, 0)
            #Increment the count for the bigram
            frequencies[bigram] += 1
        #Read the next chunk of text, and set all letters to lowercase
        text = input_fp.read(buffer_size).lower()

    return
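A minimal, hypothetical driver for the function above: TOKENIZER is assumed to be a module-level NLTK tokenizer that supports span_tokenize (a RegexpTokenizer stands in here), and input.txt is a placeholder file name.

from collections import Counter
from nltk.tokenize import RegexpTokenizer

TOKENIZER = RegexpTokenizer(r'\w+')
frequencies = Counter()
with open('input.txt') as input_fp:
    count_bigrams(input_fp, frequencies)
print(frequencies.most_common(10))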
Code example #20
def count_bigrams(frequencies):
    '''Read the text content of a file and keep a running count of how often
    each bigram (sequence of two) characters appears.

    Arguments:
        frequencies -- mapping from each bigram to its counted frequency
        
    Returns:
        nothing
    '''

    # Read in all the text from the file and set all letters to lowercase
    with open(sys.argv[1], "r") as f:
        text = f.read().lower()

    # This step is needed to collapse runs of space characters into one
    text = ' '.join(text.split())
    """
    spans = TOKENIZER.span_tokenize(text)
    tokens = (text[begin : end] for (begin, end) in spans)
    """

    tokens = TOKENIZER.tokenize(text)
    for bigram in bigrams(tokens):
        # Increment the count for the bigram. Automatically handles any
        # bigram not seen before. The join expression turns 2 separate
        # single-character strings into one 2-character string
        frequencies[''.join(bigram)] += 1

    return
Code example #21
File: nltk.py Project: thePortus/dhelp
    def ngrams(self, gram_size=3):
        """Gives ngrams.

        Returns a list of ngrams, each ngram represented as a tuple.

        Args:
            gram_size (:obj:`int`, optional) Size of the ngrams to generate

        Returns:
            :obj:`list` of :obj:`tuple` Words of each ngram

        Example:
            >>> text = EnglishText('They hated to think of sample sentences.')
            >>> basic_ngrams = text.ngrams()
            >>> print(basic_ngrams)
            [('They', 'hated', 'to'), ('hated', 'to', 'think'), ('to', 'think', 'of'), ('think', 'of', 'sample'), ('of', 'sample', 'sentences'), ('sample', 'sentences', '.')]
        """ # noqa
        tokens = self.tokenize()
        if gram_size < 2:  # pragma: no cover
            gram_size = 2
        if gram_size == 2:  # pragma: no cover
            return list(bigrams(tokens))
        if gram_size == 3:
            return list(trigrams(tokens))
        else:  # pragma: no cover
            return list(ngrams(tokens, gram_size))
Code example #22
File: core.py Project: PyWilhelm/FoLT2014
 def generate_ds(self, words):
     learning_info_dict = {lang: {w: float(t) 
                           for w, t in self._language_model_cfd[lang].most_common()} 
                           for lang in self._language_model_cfd.keys()}
     testing_info_dict = {w: float(t) 
                          for w, t in FreqDist([tpl for word in words for tpl in bigrams(word)]).most_common()}
     return learning_info_dict, testing_info_dict
Code example #23
File: core.py Project: PyWilhelm/FoLT2014
 def generate_ds(self, words):
     learning_info_dict = {lang: {w: float(t) 
                           for w, t in self._language_model_cfd[lang].most_common()} 
                           for lang in self._language_model_cfd.keys()}
     testing_info_dict = {w: float(t) 
                          for w, t in FreqDist(bigrams([w.lower() for w in words])).most_common()}
     return learning_info_dict, testing_info_dict
Code example #24
 def organize_aggregated_topics_by_dict(self, aggregated_topics, topics):
     aggregated_topics.append([None])
     topics.pop()
     topics.append(None)
     modified_topics = map(lambda x: [x.decode('utf-8')], topics[0:-1])
     modified_topics.append([None])
     modified_bigrams_of_topics = bigrams(modified_topics)
     aggregated_list_of_tuple = []
     for i in range(len(modified_bigrams_of_topics)):
         aggregated_list_of_tuple.append(
             tuple([
                 aggregated_topics[aggregated_topics.index(
                     modified_bigrams_of_topics[i][0])][0].encode('utf-8'),
                 aggregated_topics[aggregated_topics.index(
                     modified_bigrams_of_topics[i][0]):aggregated_topics.
                                   index(modified_bigrams_of_topics[i][1])]
             ]))
     dict_of_sentences_by_topic = dict(aggregated_list_of_tuple)
     for k, v in dict_of_sentences_by_topic.items():
         dict_of_sentences_by_topic[k] = [
             s for s in dict_of_sentences_by_topic[k]
             if s != [k.decode('utf-8')]
         ]
     for k, v in dict_of_sentences_by_topic.items():
         for i in range(len(dict_of_sentences_by_topic[k])):
             dict_of_sentences_by_topic[k][i] = map(
                 lambda w: w.lower(), dict_of_sentences_by_topic[k][i])
     return dict_of_sentences_by_topic
Code example #25
def autocorrect_query(query, df, cutoff=0.8, warning_on=True):
    """
    autocorrect a query based on the training set
    """
    train_data = df.values[df['search_term'].values == query, :]
    s = ""
    for r in train_data:
        w = r
        s = "%s %s %s" % (s, BeautifulSoup(r[1]).get_text(
            " ", strip=True), BeautifulSoup(r[2]).get_text(" ", strip=True))
    s = re.findall(r'[\'\"\w]+', s.lower())
    s_bigram = [' '.join(i) for i in bigrams(s)]
    s.extend(s_bigram)
    corrected_query = []
    for q in query.lower().split():
        if len(q) <= 2:
            corrected_query.append(q)
            continue
        if bool(re.search('\d',
                          q)):  # skip if it is word with number, like 4.5in_
            corrected_query.append(q)
            continue
        corrected_word = difflib.get_close_matches(q, s, n=1, cutoff=cutoff)
        if len(corrected_word) > 0:
            corrected_query.append(corrected_word[0])
        else:
            if warning_on:
                print(
                    "WARNING: cannot find matched word for '%s' -> used the original word"
                    % (q))
            corrected_query.append(q)
    return ' '.join(corrected_query)
Code example #26
def get_bigram(text_list):
	# text_list is a list of strings
	new_list = []
	for i in range(len(text_list)):
		new_list.append(list(bigrams(text_list[i])))

	return new_list
Code example #27
    def _get_filtered_bigrams(self, words):
        filtered_bigrams = []

        for bi in bigrams(words):
            if not any(w for w in bi if w in stopwords) and bi[0] != bi[1]:
                filtered_bigrams.append(bi)
        return filtered_bigrams
Code example #28
def sentence_preprocessing(sentence):
    
    sentence_clean = []
    
    # Remove HTML entities
    sentence = BeautifulSoup(sentence, "lxml").get_text()
     
    # Tokenize
    tokens = preprocess(sentence, lowercase=False)
    
    # Step 1: Remove all useless things
    tokens = [tk for tk in tokens if tk not in punc] # Punctuation
    tokens = [tk for tk in tokens if tk.lower() not in stop] # Stopwords
    tokens = [tk for tk in tokens if re.match(link_pattern, tk) == None] # Link
    tokens = [tk for tk in tokens if emoticon_re.search(tk) == None] # Emoticons
    #tokens = [tk for tk in tokens if re.match(html_entities_pattern, tk) == None] # HTML entities
    tokens = [tk for tk in tokens if tk.lower() not in remove_terms] # Some special terms to remove
      
    # Step 2: Remove short words and non char
    tokens = [tk for tk in tokens if len(tk) >= 3] # Remove short words
    tokens = [tk for tk in tokens if re.match(number_pattern, tk) == None] # Remove number

    # Step 3: Add bigram
    tokens = [tk.lower() for tk in tokens] # Lowercase
    bigrams_words = ['_'.join(w) for w in list(bigrams(tokens))]
    sentence_clean = ' '.join(tokens) + ' ' + ' '.join(bigrams_words)
    
    return sentence_clean
Code example #29
File: rake_stem.py Project: neethukurian/keyextract
def generate_unibitrigrams(key_score_file):
    with open(key_score_file,'rb') as infile:
        infile.readline()
        key_list=list()
        for line in infile:
            row=list(line.split(','))
            key_list.append(row[0])
    uni_bi_trigrams=[]
    for phrase in key_list:
        words=[]
        unigrams_ls=[]
        bigrams_ls=[]
        trigrams_ls=[]
        for word in nltk.word_tokenize(phrase):
            word=re.sub('[!"#$%&\'\(\)*+,-./:;<=>?@[\]\^_`{|}~]','',word)
            words.append(word)
        unigrams_ls=words
        #bigrams_ls=list(bigrams(words))

        for x in list(bigrams(words)):
            bigrams_ls.append(x[0]+' '+x[1] )


        for x in list(trigrams(words)):
            trigrams_ls.append(x[0]+' '+x[1]+' '+x[2] )
        #trigrams_ls=list(trigrams(words))
        uni_bi_trigrams=uni_bi_trigrams+unigrams_ls+bigrams_ls+trigrams_ls
    return uni_bi_trigrams
Code example #30
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1 :
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name+"data.doc";
        fw = open(data_path,"r",encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);

        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"));
        myBig = []
        for bi in big:
            myBig.append(bi[0]+" "+bi[1]);

        fdist = FreqDist(str(w) for w in myBig);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq+= str(key[0]).strip()+","+str(key[1]).strip()+"\n";

        make_sure_path_exists(to_save_folder+folder)
        writer = open(to_save_folder+folder+"/"+folder+"[bigram_Freq].csv","w+",encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
Code example #31
File: theme.py Project: palcu/ckanext-dgu
def score_by_topic(pkg, scores):
    '''Examines the pkg and adds scores according to topics in it.'''
    themes = Themes.instance()
    for level in range(3):
        pkg_text = package_text(pkg, level)
        words, words_without_stopwords = normalize_text(pkg_text)
        for num_words in (1, 2, 3):
            if num_words == 1:
                ngrams = words_without_stopwords
                topic_ngrams = themes.topic_words
                topic_ngrams_set = themes.topic_words_set
            elif num_words == 2:
                ngrams = list(bigrams(words))
                topic_ngrams = themes.topic_bigrams
                topic_ngrams_set = themes.topic_bigrams_set
            elif num_words == 3:
                ngrams = list(trigrams(words))
                topic_ngrams = themes.topic_trigrams
                topic_ngrams_set = themes.topic_trigrams_set
            matching_ngrams = set(ngrams) & topic_ngrams_set
            if matching_ngrams:
                for ngram in matching_ngrams:
                    occurrences = ngrams.count(ngram)
                    score = (3-level) * occurrences * num_words
                    theme = topic_ngrams[ngram]
                    ngram_printable = ' '.join(ngram) if isinstance(ngram, tuple) else ngram
                    reason = '"%s" matched %s' % (ngram_printable, LEVELS[level])
                    if occurrences > 1:
                        reason += ' (%s times)' % occurrences
                    scores[theme].append((score, reason))
                    log.debug(' %s %s %s', theme, score, reason)
Code example #32
def markov_model_classify(info, sentence):
    # TODO
    prob = [
        math.log(x / info.total_examples, math.e)
        for x in info.sentiment_counts
    ]
    tokens = tokenize(sentence)
    my_bigrams = bigrams(tokens)
    token = tokens[0]
    for i in range(5):
        pToken = info.word_counts[i].get(token, 0) / info.total_words[i]
        if pToken == 0:
            prob[i] = prob[i] + math.log(OUT_OF_VOCAB_PROB, math.e)
        else:
            prob[i] = prob[i] + math.log(pToken, math.e)

    for bigram in my_bigrams:
        for i in range(5):
            counts = info.bigram_counts[i].get(bigram, 0)
            denoms = info.bigram_denoms[i].get(bigram[0], 0)
            if counts != 0:
                pBigram = counts / denoms
            else:
                pBigram = OUT_OF_VOCAB_PROB

            prob[i] = prob[i] + math.log(pBigram, math.e)

    return prob.index(max(prob)), max(prob)
Code example #33
def read_file(fileList):
    
    global G
    G = nx.Graph()

    # Internal mapping
    region2column = {1:1,
                     2:4,
                     3:7,
                     4:10}

    for fn in fileList:
        wb = open_workbook(filename=fn)
        sheet = wb.sheet_by_name(args['sheetName'])
        columnNumber = region2column[args['partOfCountry']]
        for i in range(args['startNumber'],
                       min(sheet.nrows,args['maxNrNames']+(args['startNumber']*len(fileList)))/len(fileList)):
            name = sheet.cell_value(i,columnNumber)
            freq = int(sheet.cell_value(i,columnNumber+1))
            rank = int(sheet.cell_value(i,columnNumber-1))
            # Give importance to first letter
            charBigrams = bigrams('_%s' % name)

            if not G.has_node(name):
                G.add_node(name, {'type': 'firstname', 'freq': freq, 'rank': rank, 'size': int(log(freq))*2})
                
            for cb in charBigrams:
                if not G.has_node(cb):
                    G.add_node(cb, {'type': 'charbigram'})
                if not G.has_edge(name, cb):
                    G.add_edge(name,cb)
Code example #34
def markov_model_classify(info, sentence):
    tokens = tokenize(sentence)
    #print("tokens: ", tokens)
    total_bigrams = list(bigrams(tokens))
    #print("my_bigrams: ", total_bigrams)
    prob_log_mm = -10000
    #print("11111111")
    #print("prob_log_mm: ", prob_log_mm)
    class_chosen_mm = 0

    for sen in range(CLASSES):
        class_prob = float(info.sentiment_counts[sen] / info.total_examples)
        #print("class_prob_mm: ", class_prob)
        sen_word_counts = info.word_counts[sen]
        #print("sen_word_counts_mm: ", sen_word_counts)
        sen_word_total = info.total_words[sen]
        #print("sen_word_total_mm: ", sen_word_total)
        sen_word_bigram_counts = info.bigram_counts[sen]
        #print("sen_word_bigram_counts_mm: ", sen_word_bigram_counts)
        sen_word_denom = info.bigram_denoms[sen]
        #print("sen_word_denom_mm: ", sen_word_denom)
        con_prob = 0

        for token in tokens:
            if sen_word_counts.get(token) == None:
                con_prob += math.log(OUT_OF_VOCAB_PROB)
                break
            else:
                token_con_prob = float(
                    sen_word_counts.get(token) / sen_word_total)
                #print("token_con_prob_mm: ", token_con_prob)
                con_prob += math.log(token_con_prob)
                #print("con_prob: ", con_prob)
                break

        for bigram in total_bigrams:
            #print("bigram: ", bigram)
            #bigram_con_prob = float(sen_word_bigram_counts.get(bigram, OUT_OF_VOCAB_PROB) / sen_word_denom.get(bigram[0], 1))
            if sen_word_bigram_counts.get(bigram) == None:
                con_prob += math.log(OUT_OF_VOCAB_PROB)
            else:
                bigram_con_prob = float(
                    sen_word_bigram_counts.get(bigram) /
                    sen_word_denom.get(bigram[0]))
                #print("bigram_con_prob: ", bigram_con_prob)
                con_prob += math.log(bigram_con_prob)
                #print("con_prob: ", con_prob)

        temp_mm = math.log(class_prob) + con_prob
        #print("temp_mm: ", temp_mm)

        if temp_mm > prob_log_mm:
            prob_log_mm = temp_mm
            class_chosen_mm = sen

        #print("prob_mm: ", prob_log_mm)
        #print("class_chosen_mm: ", class_chosen_mm)

    return class_chosen_mm, prob_log_mm
Code example #35
    def perplexity(self, sentence, method):
        """
        Compute the perplexity of a sentence given a estimation method

        You do not need to modify this code.
        """
        return 2.0 ** (-1.0 * mean([method(context, word) for context, word in \
                                    bigrams(self.tokenize_and_censor(sentence))]))
Code example #36
File: language_model.py Project: sangheestyle/cl1-hw
    def perplexity(self, sentence, method):
        """
        Compute the perplexity of a sentence given a estimation method

        You do not need to modify this code.
        """
        return 2.0 ** (-1.0 * mean([method(context, word) for context, word in \
                                    bigrams(self.tokenize_and_censor(sentence))]))
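The expression assumes method(context, word) returns a log-base-2 conditional probability; under that assumption a uniform model over V words comes out to a perplexity of exactly V, e.g.:

from math import log2
from statistics import mean

V = 8
log_probs = [log2(1.0 / V)] * 5         # five bigrams, each with probability 1/8
print(2.0 ** (-1.0 * mean(log_probs)))  # 8.0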
Code example #37
 def get_ngram_tokens(self, line):
     tokens = nltk.wordpunct_tokenize(line)
     message = [self.stemmer.stem(x) for x in tokens if len(x) > 2 and x not in self.stops]
     bigram = bigrams(message)
     for pair in bigram:
         joined = " ".join(pair)
         message.append(joined)
     return list(set(message))
Code example #38
File: ngram_utilities.py Project: manniche/nlangp
def bigram_format( test_corpus ):
    """
    >>> bigram_format(["the dog runs STOP", "the cat walks STOP", "the dog runs STOP"])
    [[('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')], [('the', 'cat'), ('cat', 'walks'), ('walks', 'STOP')], [('the', 'dog'), ('dog', 'runs'), ('runs', 'STOP')]]
    """

    wl = [ [word for word in sentence.split()] for sentence in test_corpus] 
    return [ util.bigrams( l ) for l in wl ]
Code example #39
def bigramsPhi(comment):
    """The basis for a bigrams feature function.
    """
    sent = [stemmer.stem(tok) for tok in comment.split()] # Stemming + punc
    unis = Counter()
    sent = ["<<START>>"] + sent + ["<<END>>"]
    unis.update(bigrams(sent))                             # Bigrams
    return unis
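Hypothetical usage: the function relies on a module-level stemmer, taken here to be NLTK's PorterStemmer.

from collections import Counter
from nltk.stem import PorterStemmer
from nltk.util import bigrams

stemmer = PorterStemmer()
print(bigramsPhi("the movie was great"))
# Counter({('<<START>>', 'the'): 1, ('the', 'movi'): 1, ('movi', 'wa'): 1, ...})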
Code example #40
 def sentProbaility(self,sent,smooth_const):
     V = 217847
     tool = MyToolKit()
     bigrs = bigrams(tool.words(sent));
     p = 1
     for tuple in bigrs:
         p = math.exp(math.log(p)+math.log(self.LaplaceSmoothing(tuple[1],tuple[0],smooth_const,V)))
         #p = math.exp(math.log(p)+math.log(self.AbsoluteDiscountingSmoothing(tuple[1],tuple[0],smooth_const,V)))
     return p
Code example #41
def getAllBigramInfo(data_df, data_coln, target_coln):

    all_bigrams = []
    for doc in data_df[data_coln]:
        doc_bigrams = list(bigrams(doc))
        all_bigrams.append(doc_bigrams)

    data_df['Bigrams'] = all_bigrams
    all_bigrams_flattened = flatten(all_bigrams)
    bigram_vocab = list(set(all_bigrams_flattened))
    vocab_df = pd.DataFrame()
    vocab_df['Bigrams'] = bigram_vocab

    data_df_1 = data_df[data_coln][data_df[target_coln] == 1]

    count_matrix_columns = data_df.index
    count_matrix = pd.DataFrame(0,
                                columns=count_matrix_columns,
                                index=bigram_vocab)

    data_df_columns = list(data_df.columns)
    del_columns = [x for x in data_df_columns if x != 'Bigrams']
    data_df.drop(del_columns, axis=1, inplace=True)

    for doc_id in count_matrix_columns:
        bigram_counter = dict(Counter(data_df['Bigrams'][doc_id]))
        for word in bigram_counter:
            count_matrix[doc_id][word] = bigram_counter[word]

    total_1_docs = len(data_df_1)
    all_1_docs = set(data_df_1.index)
    precision = []
    recall = []
    f1_score = []

    for word in bigram_vocab:
        t = count_matrix.loc[[word]].transpose()
        t1 = pd.Series(t[word])
        phrase_nonzero = set(t1.nonzero()[0])
        r1_with_phrase = phrase_nonzero.intersection(all_1_docs)
        precision_of_doc = len(r1_with_phrase) / len(phrase_nonzero)
        recall_of_doc = len(r1_with_phrase) / total_1_docs
        try:
            f1_score_of_doc = (2 * precision_of_doc * recall_of_doc) / (
                precision_of_doc + recall_of_doc)
        except:
            f1_score_of_doc = -1

        precision.append(precision_of_doc)
        recall.append(recall_of_doc)
        f1_score.append(f1_score_of_doc)

    vocab_df['Precision'] = precision
    vocab_df['Recall'] = recall
    vocab_df['F1_score'] = f1_score

    return vocab_df
Code example #42
def crear_bigramas(texto):
  # tokenize the text
  #texto_tokenizado = limpiar_texto(texto)
  # build the bigrams
  bigramas = list(bigrams(texto))
  # filter the text's bigrams, leaving out special characters
  threshold = 1  # this is meant to drop punctuation marks and special characters
  bigramas_filtrados = [bigram for bigram in bigramas if len(bigram[0])>=threshold and len(bigram[1])>=threshold]
  return bigramas_filtrados
Code example #43
File: feature_extraction.py Project: mhaas/ma-thesis
 def handleGrams(self, tokenList):
     res = []
     if self.unigrams:
         res.extend(tokenList)
     if self.bigrams:
         res.extend(bigrams(tokenList))
     if self.gappyBigrams:
         res.extend(self.gappy_bigrams(tokenList))
     return res
Code example #44
File: build.py Project: slacy/linky
    def process(self, filename):
        """process"""
        in_file = open(filename)
        self.content[filename] = in_file.read()
        in_file.close()

        words = self.content[filename].split(' ')
        grams = bigrams(words)
        self.add_grams(filename, grams)
Code example #45
File: language_model.py Project: jvieitez/cl1-hw
    def add_train(self, sentence):
        """
        Add the counts associated with a sentence.
        """

        # You'll need to complete this function, but here's a line of code that
        # will hopefully get you started.
        for context, word in bigrams(self.tokenize_and_censor(sentence)):
            None
Code example #46
def brown_bigrams(category):
    """Takes as input the name of a brown category, and returns a list of all of the bigrams in the category."""
    words = ["<s>"]
    words += [
        word.lower() for word in brown.words(categories=category)
        if word.isalnum()
    ]
    words.append("</s>")
    return list(bigrams(words))
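Example use, assuming the NLTK Brown corpus has been downloaded and the function's brown/bigrams imports are in scope:

from nltk import FreqDist

fd = FreqDist(brown_bigrams("news"))
print(fd.most_common(3))  # the most frequent word pairs in the news category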
Code example #47
File: app.py Project: rlatks30/deepbot-tutorial
def check_intent(message):
    bigram = bigrams(message)

    bigram_tokens = []
    for item in bigram:
        bigram_tokens.append(''.join(item))

    if( "검색" in bigram_tokens):
        app.INTENT_STATUS = 1
Code example #48
File: word_feature.py Project: artir/cl2_project
 def get_feature_by_all_bigrams(self, bgs):
     bg_counts = list()
     for statuses in self._author_statuses:
         count = 0
         for status in statuses:
             for bg in bigrams(status):
                 if bg in bgs:
                     count += 1
         bg_counts.append(count)
     return bg_counts
Code example #49
def classify_paras(paras, classifier):
    d = collections.defaultdict(list)

    for para in paras:
        words = [w.lower() for w in itertools.chain(*para)]
        feats = dict([(w, True) for w in words + list(bigrams(words))])
        label = classifier.classify(feats)
        d[label].append(" ".join(words))

    return d
Code example #50
File: my_nlp.py Project: Aravindpogu/NLP
def get_bigrams_frequency_dist(tokens):
    bigram_freq_dist = {}
    list_for_ngrams = get_list_for_ngrams(tokens)

    bigram_list = list(bigrams(list_for_ngrams))
    for bigram_tuple in bigram_list:
        if bigram_freq_dist.has_key(bigram_tuple):
            bigram_freq_dist[bigram_tuple] += 1
        else:
            bigram_freq_dist[bigram_tuple] = 1
    return bigram_freq_dist
Code example #51
File: my_nlp.py Project: Aravindpogu/NLP
def get_ngrams_frequency_dist(tokens):
    ngram_freq_dist = {}
    list_for_ngrams = get_list_for_ngrams(tokens)

    ngram_list = list(bigrams(list_for_ngrams)) + list(ngrams(list_for_ngrams, 3)) + list(ngrams(list_for_ngrams, 4))
    for ngram in ngram_list:
        if ngram_freq_dist.has_key(ngram):
            ngram_freq_dist[ngram] += 1
        else:
            ngram_freq_dist[ngram] = 1
    return ngram_freq_dist
Code example #52
 def _order_tags_by_sent(self):
     self.tokenized_content = self._tokenize_content()
     tags_into_tokenized_content = []
     bigrams_of_tags_by_sent = []
     ordered_tags_by_sent = []
     for sent in self.tokenized_content:
         tags_into_tokenized_content.append([tag for tag in sent if tag in TAGS])
     for tags_by_sent in tags_into_tokenized_content:
         bigrams_of_tags_by_sent.append(bigrams(tags_by_sent))
         ordered_tags_by_sent.append(list(OrderedSet(tags_by_sent)))
     return ordered_tags_by_sent
Code example #53
File: document.py Project: Bankq/CS6998
 def tokenize(self, sentence, do_stopwords, do_stemming,use_bigrams):
         words = word_tokenize(sentence)
         words = [w.lower() for w in words if len(w) > 2]
         if do_stopwords:
                 words = [w for w in words if w not in stop_set]
         if do_stemming:
                 stemmer = PorterStemmer()
                 words = [stemmer.stem(w) for w in words]
         if use_bigrams:
                 words = bigrams(words)
         return words
Code example #54
    def is_gift_card_page(self, tokenList):
        # TODO This is duplicative - you need to remove this function

        lower_case_tokens = [self.make_lower_case_without_punctuation(w) for w in tokenList]

        bigramList = util.bigrams(lower_case_tokens)

        # this needs to be refined to pull out any punctuation

        if ("gift", "card") in bigramList:
            return True
        else:
            return False
Code example #55
File: BigramTest.py Project: Rukshani/Euler
def cal_bigram_probability(fileName, sentence1, words):
    sentence_tokens = word_tokenize(sentence1.lower())
    index = 0
    bigram_probability = 1.0
    unigram_counter = FreqDist(words)
    bigram_counter = FreqDist(bigrams(words))
    for index, items in enumerate(sentence_tokens):
        n = index + 1
        if n < len(sentence_tokens):
            bigram_probability_pair = (float(bigram_counter[items, sentence_tokens[n]]) / float(unigram_counter[items]))
            print("P("+sentence_tokens[n]+ "|"+ items+ ") = "+ str(bigram_probability_pair))
            bigram_probability *= bigram_probability_pair
    print (str(bigram_probability))
    return bigram_probability
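A toy invocation; note that the fileName argument is never used by the function body, and words stands for the tokenized training corpus. The example's own imports (word_tokenize, FreqDist, bigrams) and NLTK's punkt data are assumed to be available.

words = "the cat sat on the mat the cat ran".split()
cal_bigram_probability(None, "the cat sat", words)
# prints P(cat|the) = 0.666..., P(sat|cat) = 0.5, and the product 0.333...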
Code example #56
def testFunc():
    fw = open("./MZI/data.doc", "r", encoding="utf8");
    text = fw.read();
    tockens = getWordList(text)
    print(len(set(tockens)))
    from nltk.probability import FreqDist
    from nltk.util import bigrams
    fdist = FreqDist(w for w in tockens if len(w) > 1);
    fdist.tabulate(50);
    big = list(bigrams(w for w in tockens if len(w) > 1));
    print(big[:100]);
    fdist = FreqDist(str(w) for w in big);
    fdist.tabulate(10);
    fdist.plot(50)
Code example #57
    def __init__(self, corpra, *args, **kwargs):

        if isinstance(corpra, basestring):
            self.words = nltk.word_tokenize(corpra)
        else:
            self.words = corpra.words(*args, **kwargs)

        self.pos_words = nltk.pos_tag(self.words)
        self.bgrams = bigrams(self.pos_words)
        self.freqdist = nltk.ConditionalFreqDist(self.bgrams)

        self.word_dict = {}
        self.pos_dict = {}
        self.punct = [',', '.', '!', ':', ';', '?', '--', '-', '"', "'", 's', '$', ',"']
        self.word_dict_builder()
Code example #58
def norm_words(words):
	if not args.no_lowercase:
		words = [w.lower() for w in words]
	
	if not args.punctuation:
		words = [w.strip(string.punctuation) for w in words]
		words = [w for w in words if w]
	
	if stopset:
		words = [w for w in words if w.lower() not in stopset]
	
	if args.bigrams:
		return words + list(bigrams(words))
	else:
		return words