Example #1
    def get_auxiliary_feature(self, current_sentence, idx):
        avec = []
        _aux_type = None
        # Pad with 'none' so the three-word lookahead never runs off the end.
        _tmp_sen = current_sentence.words_list + ['none', 'none', 'none']
        _current_word = _tmp_sen[idx]
        _next_words = _tmp_sen[idx + 1:idx + 4]
        _next_words = [lemma(_next_word) for _next_word in _next_words]
        try:
            avec.append(self.aux_word_2_idx.index(_current_word))
        except ValueError:
            avec.append(len(self.aux_word_2_idx))
            self.aux_word_2_idx.append(_current_word)
        try:
            avec.append(self.aux_lemma_2_idx.index(lemma(_current_word)))
        except ValueError:
            avec.append(len(self.aux_lemma_2_idx))
            self.aux_lemma_2_idx.append(lemma(_current_word))
        if (_next_words[0] in ('so', 'likewise')
                or _next_words[1] in ('same', 'opposite')
                or _next_words[2] == 'opposite'):
            _aux_type = 'so'
        else:
            _lemma = lemma(_current_word)
            _aux_type = _lemma if _lemma in AUXILIARY_TYPE else 'modal'

        avec.append(AUXILIARY_TYPE.index(_aux_type))
        current_sentence.aux_type = _aux_type
        current_sentence.trigger = _current_word
        current_sentence.trigger_index = idx
        return avec
Example #2
def term_frequencies(tokens, selected_entities, key_phrase_tokens):
    punctuations = list(string.punctuation)
    entities = ""
    for entity in selected_entities:
        entities += str(entity[0]) + ' '
    entities = tokenize(entities.lower())

    tf = {}
    i = 0  # sentence counter, incremented at every '.' token
    len_token = len(tokens)
    for token in tokens:
        # Lemmatize only tokens the spell-check dictionary recognizes.
        if dictionary.check(token.lower()):
            token = en.lemma(token)
        if (token not in stopwords.words('english')
                and token not in punctuations
                and token not in hoax_stopwords
                and len(token) > 1 and token not in ("''", "``")):
            # Tokens in the first sentence (i == 0) get double weight; later
            # sentences are down-weighted by (len_token - i) / len_token.
            weight = 2.0 if i == 0 else 1.0
            tf[token] = tf.get(token, 0.0) + weight * (len_token - i) / len_token
        elif token == ".":
            i += 1

    for token in key_phrase_tokens:
        if dictionary.check(token.lower()):
            token = en.lemma(token)
        if (token not in stopwords.words('english')
                and token not in punctuations
                and token not in hoax_stopwords
                and len(token) > 1 and token not in ("''", "``")):
            weight = 2.0 if i == 0 else 1.0
            tf[token] = tf.get(token, 0.0) + weight * (len_token - i) / len_token
        elif token == ".":
            i += 1

    tf = sorted(tf.items(), key=operator.itemgetter(1), reverse=True)
    return tf
Example #3
def VB_VB_correction(payload, raw_text, error_count):
    # Correct errors of the type "is-walking" or "has-cooked".
    if payload.tag_[:2] != 'VB':
        return raw_text, error_count
    nounBeforeVerb = False
    nounAfterVerb = False
    verbFound = False
    if payload.text in ('is', 'was', 'are', 'were'):
        return raw_text, error_count

    for ch in payload.children:
        if ch.tag_[:2] == 'VB':
            verbFound = True
        if not verbFound and ch.dep_ == 'nsubj':
            print(ch.lower_)
            nounBeforeVerb = True
        if verbFound and ch.dep_ == 'nsubj':
            nounAfterVerb = True

        ifHave = False
        ifBeen = False
        if ch.tag_[:2] == 'VB':  # this might need to be removed
            dummy, error_count = VB_VB_VB_correction(ch, raw_text, error_count)
            try:
                if ch.lower_ in ('has', 'have', 'had'):
                    ifHave = True
                if ch.lower_ == 'been' or payload.text == 'been':
                    ifBeen = True

                if ifHave and ifBeen:
                    # "has/have/had ... been" -> past participle
                    x = conjugate(verb=lemma(payload.text),
                                  tense=PAST + PARTICIPLE,
                                  mood=INDICATIVE,
                                  person=1,
                                  number=PL)
                elif nounBeforeVerb and ch.lower_ in ('is', 'are', 'was', 'were'):
                    # "is/are/was/were" with a preceding subject -> present progressive
                    x = conjugate(verb=lemma(payload.text),
                                  tense=PRESENT,
                                  mood=INDICATIVE,
                                  aspect=PROGRESSIVE,
                                  person=1,
                                  number=PL)
                else:
                    x = payload.text

                if x != payload.text:
                    error_count += 1
                # Replace only the first occurrence after the verb's offset.
                raw_text = (raw_text[:payload.idx] +
                            raw_text[payload.idx:].replace(payload.text, x, 1))
                return raw_text, error_count
            except TypeError:
                return raw_text, error_count
    return raw_text, error_count
Example #4
 def createConceptPair(self, filename, target_corpus):
     corpus_conceptPair = []
     for sentence, target_sentence in zip(self.docs[filename], target_corpus):
         sentence_conceptPair = []
         if len(target_sentence) > 0:
             # Append the first word so the last word's "next" wraps around.
             target_sentence += [target_sentence[0]]
         for i in range(len(sentence)):
             curr_word = lemma(sentence[i]).encode('utf-8')
             if sentence[i] in target_sentence:
                 pos = target_sentence.index(sentence[i])
                 prev_word = lemma(target_sentence[pos - 1]).encode('utf-8')
                 next_word = lemma(target_sentence[pos + 1]).encode('utf-8')
             else:
                 prev_word = curr_word
                 next_word = curr_word
             sentence_conceptPair.append([[curr_word, next_word],
                                          [curr_word, prev_word]])
         corpus_conceptPair.append(sentence_conceptPair)
     return corpus_conceptPair
Example #5
def getSynonyms(word, part):
    synonyms = []
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]
    for syn in synList:
        if " " not in syn:
            if part == "VB" or part == "VBP":
                synonyms.append(lemma(syn))
            elif part == "VBD" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[3])
            elif part == "VBG" and len(lexeme(syn)) > 0:
                synonyms.append(lexeme(syn)[0])
            elif part == "VBN" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[-1])
            elif part == "VBZ" and len(lexeme(syn)) > 1:
                synonyms.append(lexeme(syn)[1])
            elif part == "NN" and syn[-2:] != "ss":
                synonyms.append(singularize(syn))
            elif part == "NNS":
                synonyms.append(pluralize(syn))
            else:
                synonyms.append(syn)
    return list(set(synonyms))
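
A hedged usage sketch for getSynonyms(); `dictionary` is assumed to be a PyDictionary-style object with a synonym() lookup, so the actual synonym list depends on that backend:

# Hypothetical call: past-tense ("VBD") forms of synonyms of "run".
# For a regular verb, lexeme() yields [base, 3sg, gerund, past], so index 3 is the past tense.
print(getSynonyms('run', 'VBD'))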
Example #6
def sent_to_bin_q(sentence_tree):
    assert (sentence_tree.label() == SENTENCE)
    subj = sentence_tree[0]
    # if subjectless
    if subj.label() != NP:
        return tree_to_string(sentence_tree)
    assert (subj.label() == NP)
    vp = sentence_tree[1]
    assert (vp.label() == VP)
    head_verb = vp[0]
    assert (is_verb(head_verb.label()) and head_verb.label() != VP)
    verb = head_verb[0]

    # has auxiliary
    if is_modal(head_verb, vp) or lemma(verb) == 'be':
        uncap(subj)  # uncapitalize original subj (unless proper noun)
        return ' '.join([verb, tree_to_string(subj)] +
                        [tree_to_string(node) for node in vp[1:]])

    # no auxiliary, use do-insertion
    else:
        head_verb[0] = lemma(verb)  # convert head verb to infinitive
        uncap(subj)  # uncapitalize original subj (unless proper noun)
        return ' '.join([
            conjugate('do', get_inflection(head_verb)),
            tree_to_string(subj),
            tree_to_string(vp)
        ])
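
A hedged illustration of the two branches in sent_to_bin_q() (tree construction with the assumed SENTENCE/NP/VP labels is omitted):

# "He is tall."      -> auxiliary branch    -> "is he tall"
# "John likes cats." -> do-insertion branch -> "does John like cats"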
Example #7
def testBasic():
    from pattern.en import referenced
    print referenced('hour')
    
    from pattern.en import conjugate, lemma, lexeme
    print lexeme('purr')
    print lemma('purring')
    print conjugate('purred', '3sg') # he / she / it
Example #9
def has_auxiliary(head_verb, head_verb_tag, vp_tags):
    if head_verb_tag == 'MD' or lemma(head_verb) in ['be', 'do']:
        return True
    # "have" counts as auxiliary only when another verb follows, e.g. "has cooked".
    if (lemma(head_verb) == 'have' and len(vp_tags) > 2
            and vp_tags[2].startswith('V')):
        return True
    return False
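
A few hedged checks for has_auxiliary(); vp_tags is assumed to be the POS-tag sequence of the verb phrase:

print(has_auxiliary('can', 'MD', ['MD', 'VB']))           # modal -> True
print(has_auxiliary('has', 'VBZ', ['VBZ', 'RB', 'VBN']))  # "have" + participle -> True
print(has_auxiliary('walks', 'VBZ', ['VBZ']))             # plain verb -> False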
Example #10
def get_conjugations(lem):
    vforms = []
    if lemma(lem) == 'be':
        vforms = [i for i in EXCEPTIONS]
    else:
        for ta in TENSE_ASPECTS:
            c = conjugate(lemma(lem), ta)
            vforms.append(c + '|||' + ta)
    return vforms
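
A minimal sketch of the globals get_conjugations() assumes; the names come from the example itself, but these values are made up, and the exact output depends on pattern.en:

EXCEPTIONS = ['be', 'am', 'is', 'are', 'was', 'were', 'been']  # hypothetical values
TENSE_ASPECTS = ['inf', '3sg', 'part', 'ppart']                # hypothetical values
print(get_conjugations('walks'))
# -> ['walk|||inf', 'walks|||3sg', 'walking|||part', 'walked|||ppart']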
Example #11
def is_partial_match(query, table_names):
    query = lemma(query)
    table_names = [[lemma(x) for x in names.split(' ')] for names in table_names]
    same_count = 0
    result = None
    for names in table_names:
        if query in names:
            same_count += 1
            result = names
    # Only an unambiguous (single) match counts.
    return result if same_count == 1 else False
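
A hedged usage sketch for is_partial_match() (the table names are made up; exact lemmas depend on pattern.en):

print(is_partial_match('car', ['car name', 'driver list']))  # ['car', 'name']
print(is_partial_match('car', ['car name', 'race car']))     # False (ambiguous match)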
Example #12
 def lemmatize(self, word):
     if word in self._vocab._word2index:
         return word
     elif word in self._inverse_map:
         return self._inverse_map[word]
     base = lemma(word)
     if base in self._vocab._word2index:
         return base
     elif base in self._inverse_map:
         return self._inverse_map[base]
     # Fall back to the index-0 (unknown) token.
     return self._vocab.index2word(0)
Example #13
def get_verb_reduction(verb, tag):
    """Given the string of an existing verb, returns its corresponding reduction:
    the verb itself if its lemma is in the top-100 list, else its hash."""
    if lemma(verb.lower()) in literals.verbs or lemma(verb.lower()) in top100.verbs:
        return verb.upper()
    # Hash the verb's tense signature so inflected forms map to the same token.
    h = sha256(str(tenses(verb)).encode('utf_8')).hexdigest()
    return tag + '_' + h
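
A hedged usage sketch; literals.verbs and top100.verbs are project-specific word lists, so which branch fires depends on their contents:

print(get_verb_reduction('running', 'VBG'))
# -> 'RUNNING' if the lemma 'run' is in either list,
#    otherwise 'VBG_' + sha256 of str(tenses('running'))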
Example #14
def decompose_verb(verb, verb_tag):
    logger.debug('verb \t %s verb_tag \t %s' % (verb, verb_tag))
    tense = verb_tense_dict[verb_tag]
    if conjugate('do', tense) == 'did':
        return_do = 'is'
        return return_do, verb
    else:
        return_do = conjugate('do', tense)
        print('verb', verb, verb_tag)
        print(conjugate('do', tense))
        print('lemma', lemma(verb))
        return return_do, lemma(verb)
Example #15
    def search_answer(self, cnstrd_word_syn, wd_in_sent, key_wd_idx):
        """
        This function searches for the constrained word of the question.
        Parameters:
            cnstrd_word_syn (list) - the synonyms of the constrained word in the question
            wd_in_sent (list) - word-tokenized text
            key_wd_idx (int) - the position of the key word in the sentence
        return: the position of the constrained word in the sentence of the text
        """
        porter = nltk.PorterStemmer()
        lancaster = nltk.LancasterStemmer()
        for cw in cnstrd_word_syn:
            cw_separate = []
            if '_' in cw:
                # Split multi-word synonyms like 'ice_cream' into their parts.
                cw1 = cw.split('_')[0]
                cw2 = cw.split('_')[1]
                cw_separate = [cw1, cw2]
                cw = ' '.join(cw.split('_'))
                cw_separate.append(cw)
            for sent in wd_in_sent[key_wd_idx:]:
                # Match on either Porter stems or pattern.en lemmas.
                if (porter.stem(cw.lower()) == porter.stem(sent.lower())
                        or lemma(cw) == lemma(sent)):
                    return wd_in_sent.index(sent)
        return None
Example #16
	def createConceptPair(self, user):
		sentences_conceptPair = []
		for sentence in self.docs[user]:
			sentence_conceptPair = []
			# Append the first word so the last word's "next" wraps around.
			sentence += [sentence[0]]
			for i in range(len(sentence) - 1):
				next_word = lemma(sentence[i + 1]).encode('utf-8')
				curr_word = lemma(sentence[i]).encode('utf-8')
				sentence_conceptPair.append([next_word, None, curr_word])
			sentences_conceptPair.append(sentence_conceptPair)
		return sentences_conceptPair
Example #17
def is_subkeys(term1, term2):
    # True if the lemmatized term1 occurs inside the lemmatized term2.
    lt1 = lemma(term1)
    lt2 = lemma(term2)
    return lt1 in lt2
Example #18
def morphological_error(target, response):
	if response == pluralize(target): 
		#Checks for the plural form of the target.
		return True
	if response == comparative(target) :
		#Checks for the comparative form of the target. 
		return True
	if response == superlative(target): 
		#Checks for the superlative form of the target.
		return True
	if lemma(target) == lemma(response): 
		#Check to see if the target and response share a lemma.
		return True
	return False
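
A few hedged checks for morphological_error() (outputs depend on pattern.en's inflection tables):

print(morphological_error('dog', 'dogs'))    # plural form      -> True
print(morphological_error('big', 'bigger'))  # comparative form -> True
print(morphological_error('run', 'ran'))     # shared lemma     -> True
print(morphological_error('cat', 'bird'))    # unrelated        -> False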
Example #19
 def createConceptPair(self, filename):
     sentences_conceptPair = []
     for sentence in self.docs[filename]:
         sentence_conceptPair = []
         sentence = list(sentence)
         # Append the first word so the last word's "next" wraps around.
         sentence += [sentence[0]]
         for i in range(len(sentence) - 1):
             prev_word = lemma(sentence[i - 1]).encode('utf-8')
             next_word = lemma(sentence[i + 1]).encode('utf-8')
             curr_word = lemma(sentence[i]).encode('utf-8')
             sentence_conceptPair.append([[curr_word, next_word],
                                          [curr_word, prev_word]])
         sentences_conceptPair.append(sentence_conceptPair)
     return sentences_conceptPair
Example #20
	def createFeatVector(self, word_relations_list):
		features = []
		for sentence in word_relations_list:
			sentence_len = len(sentence)
			feature = np.zeros((sentence_len, 96))
			emissions = np.zeros((sentence_len, 1))
			if sentence_len == 0:
				continue
			for i in range(len(sentence)):
				word, rel_1, rel_2, emission = sentence[i]
				word = lemma(word).encode('utf-8')
				if word in self.sparse_dict:
					cn_weights = [self.sparse_dict[word][w][0]["weight"] for w in self.sparse_dict[word]]
					cn_rels    = [self.sparse_dict[word][w][0]["rel"]    for w in self.sparse_dict[word]]
					for j in range(len(cn_rels)):
						if cn_rels[j] in rel_1:
							rel_index = self.Rels.index(cn_rels[j])
							feature[i][rel_index] += cn_weights[j]
						if cn_rels[j] in rel_2:
							# The second relation set maps to the upper 48 of the 96 columns.
							rel_index = self.Rels.index(cn_rels[j]) + 48
							feature[i][rel_index] += cn_weights[j]
					emissions[i] += emission
			# Normalization
			feature = normalize(feature, axis=1) * emissions
			features.append(feature)
		features = np.asarray(features)
		return features
Example #21
def pretreat_a(data):
    global train_ans, valid_ans, anslist
    fw = open(path + '/data/' + data + '.ans', 'w')
    f = open(path + '/data/' + data + '.a')
    lines = f.readlines()
    for line in lines:
        line = line.replace('\n', '').split(' ')
        j = len(line)
        for i in range(j):
            # Drop function words, then lemmatize what is left.
            if line[i] in ['the', 'and', 'a', 'an', 'with']:
                line[i] = ''
            line[i] = lemma(line[i])
            if line[i] == 'have':
                line[i] = ''
            if i != j - 1:
                if line[i] != '' and line[i + 1] != '':
                    line[i] = line[i] + ' '
            else:
                line[i] = line[i] + '\n'
            fw.write(line[i])
    f.close()
    fw.close()
Example #22
def with_best_data():
    format_text = ''
    word_labels = {}
    # Read each article, strip tags, tokenize, lemmatize, label, then output.
    for parent, dirnames, filenames in os.walk(train_source_dir):
        for filename in filenames:
            # Read the file.
            source_fp = open(train_source_dir + filename)
            all_source_text = source_fp.read().decode("utf-8")  # mind the encoding
            source_fp.close()
            # Strip HTML tags.
            re_h = re.compile(r'</?\w+[^>]*>')
            source_text = re_h.sub(" ", all_source_text)
            # Tokenize, lemmatize, and attach a sentiment label.
            words = nltk.word_tokenize(source_text)
            print words
            for word in words:
                lemmed = lemma(word)
                polarity = sentiment(lemmed)[0]  # FIXME: this call raises an error
                if polarity >= 0.5:
                    label = 2
                elif polarity <= -0.5:
                    label = 1
                else:
                    label = 0
                word_labels[lemmed] = label
    for word in word_labels:
        format_text += (word + '_' + str(word_labels[word]) + ' ')
    # TODO: should this be one sentence per line instead?
    # Write to file.
    write_file(format_text, output_filename)
Example #23
 def _check_synonyms_existence(word, word_sets, wn_pos):
     for synsets in wn.synsets(word.lower(), pos=wn_pos):
         for name in synsets.lemma_names():
             if (name in word_sets or wn.morphy(name) in word_sets
                     or lemma(name) in word_sets):
                 return True
     return False
Example #24
 def lookup(self, word):
     if word in self._vocab._word2index:
         return word
     elif word in self._inverse_map:
         return word
     else:
         try:
             base_word = lemma(word)
             if base_word in self._vocab._word2index:
                 return base_word
             elif base_word in self._inverse_map:
                 return base_word
             else:
                 # Fall back to the index-0 (unknown) token.
                 return self._vocab.index2word(0)
         except Exception:
             return self._vocab.index2word(0)
Example #25
    def superlative(self, word):
        '''
        Given a base-form word (Adj), return back a superlative form

        Args:
            word (str): base-form adj

        Raises:
            ValueError: The vocabulary does not contain the base-form
            ValueError: Can not find the base-form of the given word

        Returns:
            str: superlative form
        '''
        if word in self._word2index:
            return superlative(word)
        try:
            base_form_word = lemma(word)
        except Exception:
            raise ValueError(
                "Cannot find the base-form for '{}'".format(word))
        if base_form_word in self._word2index:
            return superlative(base_form_word)
        raise ValueError(
            "Found the base-form for '{}': '{}'. But even the base-form is not in the vocabulary"
            .format(word, base_form_word))
Example #26
    def pluralize(self, word):
        '''
        Given base-form of the word, return back plural form of the word
        (For Noun only)

        Args:
            word (str): base-form of the word

        Raises:
            ValueError: The vocabulary does not contain the base-form
            ValueError: Can not find the base-form of the given word

        Returns:
            str: plural form of the word
        '''
        if word in self._word2index:
            return pluralize(word)
        try:
            base_form_word = lemma(word)
        except Exception:
            raise ValueError(
                "Cannot find the base-form for '{}'".format(word))
        if base_form_word in self._word2index:
            return pluralize(base_form_word)
        raise ValueError(
            "Found the base-form for '{}': '{}'. But even the base-form is not in the vocabulary"
            .format(word, base_form_word))
Example #27
def clean_and_lemmatize_doc(doc, build_vocab):
    stripped_doc = []
    for char in doc:
        # Keep ASCII digits and letters; drop apostrophes; blank out the rest.
        if ('0' <= char <= '9') or ('A' <= char <= 'Z') or ('a' <= char <= 'z'):
            stripped_doc.append(char)
        elif char == "'":
            continue
        else:
            stripped_doc.append(" ")
    doc = "".join(stripped_doc).lower()
    doc = doc.split(" ")
    stripped_doc = []
    for word in doc:
        try:
            cleaned_word = lemma(word)
            if len(cleaned_word) > 0:
                stripped_doc.append(cleaned_word)
                if build_vocab == True:
                    vocab.add(cleaned_word)
        except Exception:
            continue
    return stripped_doc
Example #28
    def singularize(self, word):
        '''
        Given a base-form of noun, return a singular form
        (For Noun only)

        Args:
            word (str): base-form of noun

        Raises:
            ValueError: The vocabulary does not contain the base-form
            ValueError: Can not find the base-form of the given word

        Returns:
            str: singular form of noun
        '''
        if word in self._word2index:
            return singularize(word)
        try:
            base_form_word = lemma(word)
        except Exception:
            raise ValueError(
                "Cannot find the base-form for '{}'".format(word))
        if base_form_word in self._word2index:
            return singularize(base_form_word)
        raise ValueError(
            "Found the base-form for '{}': '{}'. But even the base-form is not in the vocabulary"
            .format(word, base_form_word))
Example #29
    def updateTerms(self, line, w2vmodel):
        list_term = line.split('_')
        list_result = []

        whitelist = set(
            ['win', 'won', 'most', 'biggest', 'largest', 'fastest'])
        blacklist = set(['give', 'also'])
        stoplist = set(stopwords.words('english'))

        for term in list_term:
            if term in blacklist:
                continue
            if term not in whitelist and term in stoplist:
                continue
            # Try the raw term first, then its lemma, then its singular form.
            lem = lemma(term)
            sing = singularize(term)

            if term in w2vmodel.vocab:
                list_result.append(term)
            elif lem in w2vmodel.vocab:
                list_result.append(lem)
            elif sing in w2vmodel.vocab:
                list_result.append(sing)
        return list_result
Example #30
def lda_comparison(corpus_savepath):
    '''string -> lda topics
    corpus_savepath is the path to save the prepared corpus for lda'''

    # Basic preprocessing and lemmatization, almost as in the lda2vec implementation.
    texts = fetch_20newsgroups(subset='train').data
    texts = [unicode(d.lower()) for d in texts]
    texts = ["".join((char if char.isalpha() else " ") for char in text).split()
             for text in texts]
    texts = [stopwords.clean([lemma(i) for i in text[:1000]], "en")
             for text in texts]

    # Build a frequency dictionary for the tokens in the corpus.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Remove very infrequent and very frequent tokens.
    texts = [[token for token in text
              if frequency[token] > 10 and len(token) > 2
              and frequency[token] < len(texts) * 0.2]
             for text in texts]

    # Create the LDA model.
    dictionary = gensim.corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    gensim.corpora.MmCorpus.serialize(corpus_savepath, corpus)
    modelled_corpus = gensim.corpora.MmCorpus(corpus_savepath)
    lda = gensim.models.ldamodel.LdaModel(modelled_corpus, num_topics=20,
                                          update_every=100, passes=20,
                                          id2word=dictionary, alpha='auto',
                                          eval_every=5)

    # Return the resulting topics.
    return lda.show_topics(num_topics=20, num_words=10, formatted=True)
Example #31
def Content2wv(line):
    pat = re.compile(r'([A-Za-z]+)')
    # Tokenize.
    line = line.replace('\n', '')
    words = line.split(" ")
    vector = np.zeros((100, 100), dtype=np.float)
    # Accumulate the word vectors.
    for i in range(0, len(words)):
        match = pat.findall(words[i])
        if match:
            try:
                vector += np.array(model.wv[match[0].lower()])
            except KeyError:
                try:
                    vector += np.array(model.wv[lemma(match[0])])
                except KeyError:
                    # Out-of-vocabulary even after lemmatizing: contribute zeros.
                    vector += np.zeros((100, 100), dtype=np.float)
        else:
            tmp = np.full((100, 100), -1, dtype=float)
            vector += tmp
    # Average over the words.
    tmpLi = []
    for i in range(len(vector)):
        for j in range(len(vector[i])):
            vector[i][j] = vector[i][j] / len(words)
        tmpLi = tmpLi + list(vector[i])
    return tmpLi
Example #32
    def procesar_ejercicio_hiponimos(self, texto):
        items_ejercicio = []
        filtro = lambda x: es_sustantivo(x) or es_adjetivo(x)
        lista_palabras = obtener_palabras(filtro, texto)

        lista_palabras = list({each['token']: each
                               for each in lista_palabras}.values())
        for palabra in lista_palabras:
            palabra_token = lemma(palabra['token'])
            categorias = Categorias().listar_categorias()
            palabra_categoria = None
            for categoria in categorias:
                categoria_synset_id = categoria['synset_id']
                es_hiponimo = hip.es_hiponimo(palabra_token,
                                              categoria_synset_id)
                if es_hiponimo:
                    palabra_categoria = categoria['nombre']
            if palabra_categoria:
                # Avoid duplicate words
                if not any(obj.palabra == palabra_token
                           for obj in items_ejercicio):
                    item = ItemEjercicioHiponimos(palabra_token,
                                                  palabra_categoria)
                    items_ejercicio.append(item)
        return items_ejercicio
Example #33
 def _get_nouns(self, sentence):
     """
     input : String
     output: list
     Returns features(nouns) for each sentence.
     """
     doc = self.nlp(sentence)
     nouns = [unicode(lemma(str(word).lower())) for word in doc if word.pos == NOUN]
     return nouns
Example #34
def basic_sentence_to_question(basic_sentence):
    sbj = ' '.join(bits_to_words(basic_sentence['SBJ']))
    obj = ' '.join(bits_to_words(basic_sentence['OBJ']))
    verb = ' '.join(bits_to_words(basic_sentence['VP']))

    if verb == 'is':
        return "What is " + sbj.lower() + "? " + obj

    return "What does " + sbj.lower() + " " + lemma(verb.lower()) + "? " + obj
Example #35
	def createFeatVector(self, word_relations_list):
		sentence_len = len(word_relations_list)
		feature = np.zeros((sentence_len, 46))
		for i in range(len(word_relations_list)):
			word, rels = word_relations_list[i]
			relation_weights = cn.search(lemma(word).encode('utf-8'))
			for j in range(len(self.Rels)):
				if self.Rels[j] in relation_weights and self.Rels[j] in rels:
					feature[i][j] = relation_weights[self.Rels[j]]
		return feature
Example #36
 def _get_nouns(self, review):
     """
     Returns features(nouns) from each sentence of a review.
     """
     review_features = []
     for sent in review:
         doc = self.nlp(sent)
         # noun_phrase = [np.text for np in doc.noun_chunks]
         nouns = [unicode(lemma(str(word).lower())) for word in doc if word.pos == NOUN]
         review_features.append(nouns)
     return review_features
Example #37
def lemmatize_article(article):
    '''
    INPUT: article (str) - raw text from the article (where text has been lowered and punctuation removed already)
    OUTPUT: lemmatized_article - article text with all stopwords removed and the remaining text lemmatized
    '''
    # Load in stopwords from load_data
    stopwords = stop_words()
    # Load Dictionary to fix commonly mislemmatized words
    correct_lemma = fix_lemmatized_words()
    # Run each word through the pattern.en lemmatizer, keeping it only if it
    # is not a stopword.
    article = ' '.join([en.lemma(w) for w in article.split() if w not in stopwords])
    # Return the article text after fixing common mislemmatized words
    return ' '.join([correct_lemma[w] if w in correct_lemma else w for w in article.split()])
Example #38
def extract_noun_phrases(body_part_name):    
    stop = nltk.corpus.stopwords.words('english')    
    filename = '/Users/rsteckel/tmp/Observable_body_parts-sentences-BODYPART1.tsv'
    
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    df['lemmas'] = df['themeword'].apply(lambda x: lemma(x))
    
    sentences = df[ df['lemmas'] == body_part_name]['sentence'].tolist()
    
    phrases = []
    for sentence in sentences:
        ptree = parsetree(sentence)
        matches = search('NP', ptree)        
        for match in matches:
            filtered_np = [ word for word in match if word.string.lower() not in stop ]
            if len(filtered_np) > 0:
                phrases.append( (sentence, filtered_np) )
    
    return pd.DataFrame(phrases, columns=['sentence', 'phrase'])
Example #39
print("")

# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() functions give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print("%s => %s => %s" % (word, comparative(word), superlative(word)))
print("")

# VERB CONJUGATION
# ----------------
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print("lexeme: %s" % lexeme("be"))
print("lemma: %s" % lemma("was"))
print("")

# The conjugate() function inflects a verb to another tense.
# You can supply:
# - tense : INFINITIVE, PRESENT, PAST,
# - person: 1, 2, 3 or None,
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g.,
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False))
print(conjugate("being", tense="1sg", negated=False))
print("")
Example #40
    # It matches anything from "food" to "cat food", "tasty cat food", "the tasty cat food", etc.
    t = parsetree('tasty cat food')
    matches = search('DT? RB? JJ? NN+', t)
    for match in matches:
        print match
    print '\n'

filename = '/Users/rsteckel/tmp/Observable_body_parts-sentences-BODYPART1.tsv'
df = pd.read_csv(filename, sep='\t', encoding='utf-8')
df['lemmas'] = df['themeword'].apply(lambda x: lemma(x))


grby = df.groupby(['lemmas']).count()
sorted_df = grby.sort(['lemmas'], ascending=0)
bpdf = sorted_df[:3]

top_bps = set(bpdf['lemmas'].index.values)
df['topbp'] = df['lemmas'].apply(lambda x: x if x in top_bps else 'other')

records = []
for i,row in df.iterrows():
    try:
        if i % 100 == 0:
            print '%d of %d' % (i, len(df))        
            
Example #41
# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() commands give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print word, "=>", comparative(word), "=>", superlative(word)
print
print

# VERB CONJUGATION
# ----------------
# The lexeme() command returns a list of all possible verb inflections.
# The lemma() command returns the base form (infinitive) of a verb.
print "lexeme:", lexeme("be")
print "lemma:", lemma("was")

# The conjugate() command inflects a verb to another tense.
# The tense can be given as a constant, e.g.
# INFINITIVE, PRESENT_1ST_PERSON_SINGULAR, PRESENT_PLURAL, PAST_PARTICIPLE, ...
# or as an abbreviated alias: inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
print conjugate("being", tense="1sg", negated=False)

# Prefer the full constants for code that will be reused/shared.

# The tenses() command returns a list of all tenses for the given verb form.
# For example: tenses("are") => ['present 2nd person singular', 'present plural']
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in the list.
from pattern.en import PRESENT_PLURAL
print tenses("are")
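
For instance (a hedged sketch; per the comments above, aliases work in the membership test too):

print PRESENT_PLURAL in tenses("are")  # True
print "pl" in tenses("are")            # True (alias form)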
Example #42
def words_same_sounds_score(w1, w2, pronunciations):
	"""
	Given 2 words and the pronunciation dictionary
	Returns a score for how alike they sound
	Works by looking at intersection of phonemes
	TODO:
	Currently only uses default pronunciation from CMU, ignores any variations
	Also should deal with words not in the CMU pronouncing dict
	"""
	import re
	# If words are double, make sure pronunciations for words are found separately and joined
	w1_split, w2_split = re.split('-|_| ',w1) , re.split('-|_| ',w2)
	p1_list, p2_list = [], []

	# Firstly, if w1 contains a word in w2 or vice versa, return score of 0
	from pattern.en import lemma
	w1_temp = [lemma(x) for x in w1_split]
	w2_temp = [lemma(x) for x in w2_split]
	overlap = [val for val in w1_temp if val in w2_temp]
	if overlap:
		return 0

	try:
		for w in w1_split:
			p1_list.extend(pronunciations[w][0])
		for w in w2_split:
			p2_list.extend(pronunciations[w][0])
	except KeyError:
		# Can't find words in CMU dict
		return 0

	# If we have pronunciations for each word we continue
	if p1_list and p2_list:
		
		# We will populate these scores
		exact_sounds_score, approx_sounds_score = 0, 0

		# Remove stresses, not interested in these
		def remove_number(x):
			if x[-1] in ('0','1','2'):
				return x[:-1]
			else:
				return x
		p1_no_stress = [remove_number(x) for x in p1_list]
		p2_no_stress = [remove_number(x) for x in p2_list]

		# Find the overlap in sounds
		overlap = [x for x in p1_no_stress if x in p2_no_stress]

		# Find proportion of original sounds covered by overlap
		prop_1 = len ( [x for x in p1_no_stress if x in overlap] )*1.0 /len(p1_no_stress)
		prop_2 = len ( [x for x in p2_no_stress if x in overlap] )*1.0/len(p2_no_stress)
		#print prop_1, prop_2
		
		if prop_1 + prop_2 > 0:
			exact_sounds_score = prop_1*prop_2*2.0/(prop_1+prop_2)

		# Now check with approximate sounds, same process (weight score lower though)
		p1_no_stress = [approx_sounds.get(x, x) for x in p1_no_stress]
		p2_no_stress = [approx_sounds.get(x, x) for x in p2_no_stress]

		# Find the overlap in approx sounds
		overlap = [x for x in p1_no_stress if x in p2_no_stress]
		prop_1 = len ([x for x in p1_no_stress if x in overlap])*1.0 /len(p1_no_stress)
		prop_2 = len ([x for x in p2_no_stress if x in overlap])*1.0/len(p2_no_stress)
		
		if prop_1 + prop_2 > 0:
			approx_sounds_score = prop_1*prop_2*2.0/(prop_1+prop_2) * 0.66   # Weighted down

		# Return the higher score of the two
		return max(exact_sounds_score, approx_sounds_score)

	# No pronunciation found for one of the words.
	return 0
Example #43
	def buildRules(self, fname = ".tmp_pos", debug = 0):
		wlist = ["0"]
		taglist = []
		ind = 1
		changed = 0
		tverbs = []
		with open(fname, "r") as fid:
			for line in fid:
				if line == "\n":
					# End of sentence: run the tag-pattern rules over it.
					new_str = " ".join(taglist)
					m = com1.search(new_str)
					if m != None:
						wid = m.group(3).split("_")
						tverbs.append("BPA" + lemma(wlist[int(wid[0])]))

					m = com1_h.search(new_str)
					if m != None:
						wid = m.group(3).split("_")
						tverbs.append("BPE" + lemma(wlist[int(wid[0])]))

					m = com2.search(new_str)
					if m != None:
						wid = m.group(2).split("_")
						tverbs.append("BVP" + lemma(wlist[int(wid[0])]))

					m = com4.search(new_str)
					if m != None:
						wid = m.group(2).split("_")
						tverbs.append("BVP" + lemma(wlist[int(wid[0])]))

					m = com7.search(new_str)
					if m != None:
						wid = m.group(1).split("_")
						tverbs.append("AVP" + lemma(wlist[int(wid[0])]))

					m = com8.search(new_str)
					if m != None:
						wid = m.group(1).split("_")
						tverbs.append("AVP" + lemma(wlist[int(wid[0])]))

					m = com13.search(new_str)
					if m != None:
						wid = m.group(1).split("_")
						inid = m.group(2).split("_")
						tverbs.append("AVO" + wlist[int(wid[0])] + " " + wlist[int(inid[0])])

					m = com14.search(new_str)
					if m != None:
						wid = m.group(1).split("_")
						inid = m.group(2).split("_")
						tverbs.append("AVN" + wlist[int(wid[0])] + " " + wlist[int(inid[0])])

					m = com15.search(new_str)
					if m != None:
						wid = m.group(1).split("_")
						inid = m.group(2).split("_")
						tverbs.append("ANN" + wlist[int(wid[0])] + " " + wlist[int(inid[0])])

					wlist = ["0"]
					taglist = []
					ind = 1
				else:
					line = line.rstrip()
					l = line.split("\t")
					if l[0] == "-LRB-":
						l[0] = "("
						changed = 1
					elif l[0] == "-RRB-":
						l[0] = ")"
						changed = 1
					elif l[0] in self.seed:
						l[1] = "BOUGHT"
						changed = 1
					elif l[0] in ["is", "are", "was", "were", "been", "have", "has", "had"]:
						l[1] = l[0]
						changed = 1
					wlist.append(l[0])
					if changed == 0:
						l[1] = str(ind) + "_" + l[1]
					taglist.append(l[1])
					changed = 0
					ind += 1
		if debug == 1:
			for s in set(tverbs):
				if tverbs.count(s) > 1:
					print s
		else:
			fout = open(self.outFile, "w")
			for s in set(tverbs):
				if tverbs.count(s) > 1:
					fout.write("%s\n" % s)
			fout.close()
Example #44
def find_similarity(query):
	### Finding patterns
	a = query.split(",")
	files = '/home/sandy/SEM-6/IRE/Project/Mine/htmloutput.html'
	ff = open(files, "r")
	data = ff.read()
	data = data.split('**************  New PAGE *********************')
	while '' in data:
		data.remove('')
	patternout = []
	for i in data:
		patternout = patternout + expandUsingPatterns(i, a)

	patternout = Counter(patternout)

	### Word2vec model
	model1 = word2vec.Word2Vec.load_word2vec_format('./word2vec/wiki.model.bin', binary=True)
	print "Model loaded"
	print "\n"
	scores = {}
	f = open('./indexfile', 'r')
	# Read the index once so every query term sees all of its lines.
	index_lines = [line.strip("\n") for line in f]
	for i in a:
		for line in index_lines:
			try:
				sc = model1.similarity(i, line)
			except:
				sc = 0
			try:
				scores[line] = scores[line] + sc
			except KeyError:
				scores[line] = sc

	sorted_x = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
	k = 0
	print "Printing Results"
	print "\n"
	flag = 0
	output = []
	for key, value in sorted_x:
		if key in a:
			continue
		# Skip results that share a lemma with a query term.
		for j in a:
			if en.lemma(key) == j or en.lemma(j) == key:
				flag = 1
		if flag == 1:
			flag = 0
			continue
		# Skip results that share a lemma with an already-kept result.
		for ll in output:
			if en.lemma(key) == ll or en.lemma(ll) == key:
				flag = 1
		if flag == 1:
			flag = 0
			continue
		k = k + 1
		output.append(key)
		if k == 40:
			break
	cnt = 0
	final = []
	for res in output:
		if res in patternout.keys():
			cnt = cnt + 1
			final.append(res)
			print cnt, ":", res
		if cnt == 10:
			break
	if cnt < 10:
		for res in output:
			if res not in final:
				cnt = cnt + 1
				print cnt, ":", res
			if cnt == 10:
				break

	f.close()
	ff.close()
Example #45
 def test_lemma(self):
     # Assert the infinitive of "weren't".
     v = en.lemma("weren't")
     self.assertEqual(v, "be")
     print "pattern.en.inflect.lemma()"
Example #46
# COMPARATIVE & SUPERLATIVE ADJECTIVES
# ------------------------------------
# The comparative() and superlative() functions give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print(word, "=>", comparative(word), "=>", superlative(word))
print()
print()

# VERB CONJUGATION
# ----------------
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print("lexeme:", lexeme("be"))
print("lemma:", lemma("was"))
print()

# The conjugate() function inflects a verb to another tense.
# You can supply: 
# - tense : INFINITIVE, PRESENT, PAST, 
# - person: 1, 2, 3 or None, 
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g., 
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False))
print(conjugate("being", tense="1sg", negated=False))
print()
Example #47
def preprocess_debateIII(debate):      # lemmatises text
	lemmatised_words = [lemma(w) for w in debate]

	return lemmatised_words
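
A hedged sample call (the exact output depends on pattern.en's lemmatizer):

print(preprocess_debateIII(['running', 'was', 'cats']))  # expected: ['run', 'be', 'cat']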
Example #48
# Indefinite article
print article('university')
print article('hour')

print referenced('university')
print referenced('hour')


# Pluralization / singularization
print pluralize('child')
print singularize('wolves')

# Verb conjugation
print
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred') # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred') 

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)


#parse
Example #49
 def build_analyzer(self):
     analyzer = super(CountVectorizer, self).build_analyzer()
     # Lemmatize each token, skipping tokens that are all digits
     # (after ASCII-folding the unicode token).
     return lambda doc: (
         en.lemma(word) for word in analyzer(doc)
         if not str.isdigit(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')))
Example #50
def transform(text):
    transformations = []

    # text
    text = text.lower() + " "

    # change "going to do something" -> "do something", while preserving "going to a thing"
    tags = tag(text)
    if tags[0][0] == "going" and tags[1][0] == "to" and tags[2][1] == "VB":
        text = " ".join(text.split()[2:]) + " "

    # transform the verb
    orig_verb = text.split()[0]

    if "f**k" in orig_verb:
        transformations += [(orig_verb, "")]
        try:
            orig_verb = text.split()[1]
        except IndexError:
            return ''

    elif "something" in orig_verb or "anything" in orig_verb:
        return ''

    new_verb = conjugate(lemma(orig_verb), person=3)

    # weird "lies" bug?
    if new_verb == 'layers':
        new_verb = 'lies'

    transformations += [(orig_verb, new_verb)]

    # transform first person to third
    transformations += [(" me ", " u ")]
    transformations += [(" my ", " ur ")]

    transformations += [(" i'm ", " ur ")]
    transformations += [(" im ", " ur ")]
    transformations += [(" i am ", " ur ")]
    transformations += [(" i ", " u ")]
    transformations += [(" i ", " u ")]
    transformations += [(" i've ", " u've ")]
    transformations += [(" ive ", " u've ")]
    transformations += [(" i'd ", " u'd ")]
    transformations += [(" id ", " u'd ")]

    transformations += [(" we ", " u ")]
    transformations += [(" ours ", " urs ")]
    transformations += [(" our ", " ur ")]
    transformations += [(" us ", " ur ")]

    # transform third person to gender-neutral
    transformations += [(" his ", " her ")]
    transformations += [(" him ", " her ")]
    transformations += [(" her ", " her ")]
    transformations += [(" he ", " she ")]
    transformations += [(" she ", " she ")]
    transformations += [(" he's ", " she's ")]
    transformations += [(" she's ", " she's ")]
    transformations += [(" hes ", " she's ")]
    transformations += [(" shes ", " she's ")]

    transformations += [(" n't ", " not ")]

    for orig, repl in transformations:
        text = text.replace(orig, repl)

    return text.strip()
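
A hedged usage sketch for transform() (the result depends on pattern.en's tagger and conjugator):

print(transform("going to eat my sandwich"))  # expected: "eats ur sandwich"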
Example #51
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('humidity')


from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
st.stem('luminous') 



lemma('humidity')

frames = fn.frames_by_lemma(r'skin')
for f in frames:
    print '%s - %s\n' % (f.name, f.definition)

fn.lexical_units(r'')

fn.frames_by_lemma(r'(?i)a little')

for f in ('reflect', 'bank'):
    taxonomy.append(f, type='angle')