Example #1
def tag_text_en(tokens, tokens_span):
    """Receive tokens and spans and return tuple list with tagged tokens"""
    tagger = PerceptronTagger()
    tags = []
    for i, tagged in enumerate(tagger.tag(tokens)):
        tags.append(tagged + (tokens_span[i], []))
    return tags
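A minimal usage sketch for the function above (hedged: it assumes the spans are (start, end) character offsets, e.g. from NLTK's TreebankWordTokenizer.span_tokenize, and that the pretrained tagger data is installed):

from nltk.tokenize import TreebankWordTokenizer
from nltk.tag.perceptron import PerceptronTagger

text = "The quick brown fox jumps."
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)
spans = list(tokenizer.span_tokenize(text))  # one (start, end) offset per token
print(tag_text_en(tokens, spans))
# e.g. [('The', 'DT', (0, 3), []), ('quick', 'JJ', (4, 9), []), ...]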
Example #2
def ap(train_path, test_path):
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'

    test_sentences = list(gen_corpus(test_path))

    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))

        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)

    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))

    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
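The helpers gen_corpus and convert_sents_to_zipped are not shown above; as a hedged sketch of the call they ultimately feed, PerceptronTagger.train takes a list of sentences, each a list of (word, tag) tuples (toy corpus below; nr_iter is the documented keyword):

from nltk.tag.perceptron import PerceptronTagger

toy_train = [[('The', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')],
             [('Dogs', 'NNS'), ('bark', 'VBP')]]

ap_model = PerceptronTagger(load=False)   # start from an empty model
ap_model.train(toy_train, nr_iter=5)
print(ap_model.tag(['The', 'dog', 'sleeps']))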
Example #3
def pos_titles_from(input_path, output_path = None, options = None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
Example #4
def Top_K_verbs(k, df, text_col, class_col, plot=False):
    vect = TfidfVectorizer()
    vect.fit(df[text_col])
    tfidf_df = pd.DataFrame(vect.transform(df[text_col]).toarray())
    tfidf_df.columns = vect.get_feature_names()
    tfidf_T = tfidf_df.transpose()
    tagger = PerceptronTagger()
    tfidf_T['pos'] = tagger.tag(tfidf_T.index)
    tfidf_T = tfidf_T[tfidf_T['pos'].apply(
        lambda tup: tup[1] in ['VB', 'VBD', 'VBG', 'VBN'])]
    tfidf_df = tfidf_T.drop(['pos'], axis=1).transpose()
    top_k_by_class = dict()
    for v in df[class_col].value_counts().index:
        freq_in_class = tfidf_df[df[class_col] == v].sum(axis=0).sort_values(
            ascending=False)
        frac_in_class = freq_in_class / freq_in_class.sum()
        top_k_by_class[v] = frac_in_class[:k].index

        if plot:
            print('the top {} frequent verbs for class {}:'.format(k, v))
            plt.figure(figsize=(5, 10))
            sns.barplot(y=frac_in_class[:k].index, x=frac_in_class[:k])
            plt.xlabel('fraction')
            plt.show()

    return (top_k_by_class)
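Note that the snippet above tags each vocabulary term in isolation, so the tagger sees no sentence context; a quick check of what a single-token call returns:

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()
print(tagger.tag(['running']))   # a one-element list, e.g. [('running', 'VBG')]
print(tagger.tag(['studies']))   # tagged without any surrounding context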
Example #5
 def test_perceptron_tagging(self):
     sentence = "This is a test sentence to test if the testing works."
     tokens = word_tokenize(sentence)
     pt = PerceptronTagger(load=True)
     tag_result1 = [x[1] for x in pt.tag(tokens)]
     pt2 = perctagger()
     pt2.load()
     tag_result2 = pt2.tag(tokens)
     self.assertListEqual(tag_result1, tag_result2)
Example #6
def pos_sequence_from(keyphrase, tags):
    """Receive keyphrase dict and return list of tags"""
    pos_sequence = list(map(lambda i: tags[i][1], keyphrase["tokens-indices"]))
    # Special case when the tokenization doesn't match the annotation
    if pos_sequence == []:
        tagger = PerceptronTagger()
        keyphrase_tags = tagger.tag(keyphrase['keyphrase-text'].split())
        pos_sequence = list(map(lambda t: t[1], keyphrase_tags))
    return pos_sequence
Example #7
class HmmSeqRecognizer(object):
  def __init__(self):
    self.hmm_models = []
    self.n_hmm = 0
    self.hmm2idx = {}
    self.idx2hmm = {}
    self.tagger = PerceptronTagger()
    return

  def batch_test(self, samples, label):
    tp,ns = 0,len(samples)
    for i in xrange(ns):
      idx = self.predict_sample(samples[i])
      if idx==label: tp+=1
    return tp,float(tp)/ns

  def predict_sample(self, sample):
    sample = [sample]
    probs = [ model.test(sample) for model in self.hmm_models ]
    return probs.index(max(probs))

  def predict_sentence(self, sentence):
    sample =  [[ tag for _,tag in self.tagger.tag(word_tokenize(sentence)) ]]
    probs = [ model.test(sample) for model in self.hmm_models ]
    return probs.index(max(probs))

  def add_model(self, name, model):
    self.hmm_models.append(model)
    self.hmm2idx[name] = self.n_hmm
    self.idx2hmm[self.n_hmm] = name
    self.n_hmm += 1

  def new_hmm(self, name, datapath, nhs, ne):
    print '=> adding HMM model \'%s\'...' % name
    hmm_model = HmmModel(nhs)
    hmm_model.train(datapath,ne)
    self.add_model(name, hmm_model)
    print '|  done'
    return

  def save_hmm(self, name, hmm_path):
    print '=> saving HMM model \'%s\'...' % name
    f = open(hmm_path, 'wb')
    pickle.dump(self.hmm_models[self.hmm2idx[name]], f)
    f.close()
    print '|  done'
    return

  def load_hmm(self, name, hmm_path):
    # print '=> adding HMM model \'%s\'...' % name
    f = open(hmm_path, 'rb')
    hmm_model = pickle.load(f)
    f.close()
    self.add_model(name, hmm_model)
    # print '|  done'
    return
Example #8
class Tagger(AbstractTagger):
    def __init__(self):
        self._tagger = PerceptronTagger(load=False)
        self._name = 'nltkperceptron'
        self._model_name = "nltkperceptron"
        self._result = None
        super().__init__()

    def _save_model(self, fpath):
        with open(fpath, 'wb') as f:
            dill.dump(self._tagger, f)

    def load(self, path=''):
        if path == '':
            self._load_model(path)
        else:
            mpath = os.path.join(path, self.model_name)
            self._load_model(mpath)

    def _load_model(self, fpath):
        if fpath == '':
            self._tagger = PerceptronTagger(load=True)
        else:
            with open(fpath, 'rb') as f:
                self._tagger = dill.load(f)

    def tag(self, data):
        res = self._tagger.tag(data)
        return [x[1] for x in res]

    def train(self, data):
        # Reset tagger.
        self._tagger = PerceptronTagger(load=False)
        self._tagger.train(data)

    @property
    def produces_temp_data(self):
        return False

    @property
    def requires_additional_params(self):
        return False

    def set_additional_params(self, options):
        pass

    def add_temp_dir(self, options):
        pass

    @property
    def model_name(self):
        return self._model_name

    @property
    def name(self):
        return self._name
Example #9
def pos_tokenizer(text):
    word_tokens = tokenize(text)
    # using pretrained model to tag all tokens
    pretrained_tagger = PerceptronTagger(load=True)
    results = pretrained_tagger.tag(word_tokens)
    # collecting pos from resulting tuples
    pos_tokens = []
    for word_pos in results:
        pos_tokens.append(word_pos[1])
    return pos_tokens
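Because pos_tokenizer returns only the tag sequence, it can serve as a custom tokenizer for a bag-of-POS model; a hedged sketch (assuming the undefined tokenize above is nltk.word_tokenize):

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.tag.perceptron import PerceptronTagger

def pos_only_tokenizer(text):
    # mirrors the example above: load the pretrained tagger and keep only the tags
    tagger = PerceptronTagger(load=True)
    return [tag for _, tag in tagger.tag(word_tokenize(text))]

vect = CountVectorizer(tokenizer=pos_only_tokenizer, lowercase=False)
X = vect.fit_transform(["The cat sat on the mat.", "Dogs bark loudly."])
print(sorted(vect.vocabulary_))   # the POS tags become the feature names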
Example #10
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token)  for token in tagger.tag(tokens)])
            print " ".join([token[1]  for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
Example #11
class NLTK(BaseTagger):
    def __init__(self):
        import nltk
        nltk.download('averaged_perceptron_tagger')

        from nltk.tag.perceptron import PerceptronTagger

        self.inst = PerceptronTagger()

    def __call__(self, *args, **kwargs):
        return self.inst.tag(args[0])
Example #12
def ie_preprocess(document):

    tagger = PerceptronTagger()
    tagged = []
    sentences = document.split("\n")
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    for sent in sentences:
        tagged_tokens = tagger.tag(sent)
        tagged.append(tagged_tokens)

    return tagged
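The loop above tags one sentence at a time; NLTK's TaggerI interface also exposes tag_sents, which does the same work in a single call (a sketch assuming the same one-sentence-per-line input):

import nltk
from nltk.tag.perceptron import PerceptronTagger

def ie_preprocess_batch(document):
    tagger = PerceptronTagger()
    sentences = [nltk.word_tokenize(sent) for sent in document.split("\n")]
    return tagger.tag_sents(sentences)  # a list of tagged sentences

print(ie_preprocess_batch("John lives in London.\nMary works at Google."))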
Example #13
class NltkTagger(Tagger):
    '''Require maxtree'''

    #import nltk
    def __init__(self, *args, **kwargs):
        self.tagr = PerceptronTagger()
        super(self.__class__, self).__init__(*args, **kwargs)

    #~ def __start__(self):
    #~ self.tagr =  PerceptronTagger()

    def tag_tokens(self, tokens, single=True):
        return self.tagr.tag(tokens)
Example #14
    def _perform_analysis(self, tokenized_sents):
        res = []

        if len(self.precalced_data):
            return self.precalced_data
        else:
            for tokens in tokenized_sents:
                tagger = PerceptronTagger()
                tags = tagger.tag(tokens)

                res += tags

        self.precalced_data = res

        return res
Example #15
class NERTagger:
    def __init__(self):
        self.pos_tagger = PerceptronTagger()

    def tag(self, tokens):
        tree = nltk.ne_chunk(self.pos_tagger.tag(tokens))
        tagged_tokens = []
        for t in tree:
            if type(t) == nltk.tree.Tree:
                label = t.label()
                for token in t:
                    tagged_tokens.append((token[0], label))
            else:
                tagged_tokens.append(t)
        return tagged_tokens
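A short usage sketch for the NERTagger class above; besides the tagger model, ne_chunk needs the maxent_ne_chunker and words NLTK data packages, and the output shown is only indicative:

from nltk.tokenize import word_tokenize

ner = NERTagger()
print(ner.tag(word_tokenize("Barack Obama visited Paris last week.")))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'VBD'),
#       ('Paris', 'GPE'), ('last', 'JJ'), ('week', 'NN'), ('.', '.')]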
Example #16
def test_lesk():
    words = get_sample_words()
    print("Words =%s" % words)
    tagger = PerceptronTagger()
    tags = tagger.tag(words)
    print('Tags=%s' % tags[:5])
    lemmatizer = WordNetLemmatizer()
    for word, tag in tags:
        pos = get_wordnet_pos(tag)
        if pos is None:
            continue
        # print("word=%s,tag=%s,pos=%s" %(word, tag, pos))
        lemma = lemmatizer.lemmatize(word, pos)
        # print("Lemma=%s" %lemma)
        synset = lesk(words, lemma, pos)
        if synset is None:
            print('No synsetid for word=%s' % word)
        else:
            print('word=%s, synsetname=%s, synsetid=%d' % (word, synset.name(), synset.offset()))
Example #17
def pos_title_abstract(resource):
    xml_file = resource[0]
    pos_data = []
    try:
        xmldoc = minidom.parse(xml_file)
        elements_title = xmldoc.getElementsByTagName("Title")
        title =  elements_title.item(0).childNodes[0].nodeValue
        elements_list = xmldoc.getElementsByTagName("S")
        sentences = [e[0].nodeValue for e in [element.childNodes for element in elements_list] if e[0].nodeType == e[0].TEXT_NODE]
        sentences.insert(0, title)
        #raw text 
        txt_file = change_file_extention(resource[1], "xml", "txt")
        save_to_file(txt_file, sentences)
        #pos
        tagger = PerceptronTagger()
        for s in sentences:
            tokens = word_tokenize(s)
            pos_data.append(tagger.tag(tokens))
        pos_file = change_file_extention(resource[1], "xml", "pos")
        save_to_file(pos_file, pos_data)
    except:
        print >> sys.stderr, "Error pos_title_abstract:", resource, sys.exc_info()

    return pos_data 
Example #18
 def get_instances(self, folder):
     instances = []
     labels = set()
     tagger = PerceptronTagger() # load nltk perceptron just once to speed up tagging
     with io.open(folder, encoding="utf-8") as f:
     # with open(folder) as f:
         for line in f:
             # line = unicode(line).encode("utf-8")
             line_split = line.rstrip().split("\t")
             if len(line_split) != 3:
                 continue
             id, text, label = line_split
             id = id.rstrip(":")
             text = re.sub('[#]', '', text.rstrip())
             label = re.sub('[^a-z]', '', label)
             inst = Instance(text, label)
             inst_tokenized = word_tokenize(text)
             inst_tagged = tagger.tag(inst_tokenized)
             for tokentag in inst_tagged:
                 token = Token(tokentag[0], tokentag[1])
                 inst.add_token(token)
             instances.append(inst)
             labels.add(label)
     return instances, labels
Example #19
class TextAnalyser(object):
    """Text analyser"""

    MULTICLASS_NE_CHUNKER = \
        "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle"

    def __init__(self, keyword_stop_list=None):
        self.__keyword_stop_list = keyword_stop_list

        self.__pos_tagger = PerceptronTagger()
        self.__ne_chunker = nltk_data_load(self.MULTICLASS_NE_CHUNKER)

    def analyse_file(self, filename):
        """Analyse the contents of a file

        :param str filename: the path to a file
        :rtype: TextAnalysisResult
        :return: the analysis result
        """
        with open(filename, "r") as f:
            return self.analyse(f.read())

    def _calculate_text_statistics(self, sentences, sentence_words):
        """Calculate the text statistics

        :param list[str] sentences: a list with the text sentences
        :param list[list[str]] sentence_words: a list with the sentences that
            have been tokenized into separate words
        :rtype: TextStatistics
        :return: the calculated text statistics
        """
        words = list(itertools.chain(*sentence_words))

        sentence_word_counts = np.array(
            [len(sentence) for sentence in sentence_words])

        return TextStatistics(
            sentence_count=len(sentences),
            word_count=len(words),
            mean_sentence_word_count=float(sentence_word_counts.mean()),
            median_sentence_word_count=float(np.median(sentence_word_counts)),
            min_sentence_word_count=int(sentence_word_counts.min()),
            max_sentence_word_count=int(sentence_word_counts.max()),
            average_sentence_word_count=float(
                np.average(sentence_word_counts)),
            sentence_word_count_std=float(sentence_word_counts.std()),
            sentence_word_count_variance=float(sentence_word_counts.var()))

    def _extract_named_entities(self, sentence_words):
        """Extract the named entities from the sentences

        The result from this method is a dictionary with the named entity types
        as keys and a set of the names entities as values

        :param list[list[str]] sentence_words: a list with the sentences that
            have been tokenized into separate words
        :rtype: dict[str: set]
        :return: the extracted dictionary words
        """
        tagged_sentences = [
            self.__pos_tagger.tag(sentence) for sentence in sentence_words
        ]
        chunked_sentences = [
            self.__ne_chunker.parse(sentence) for sentence in tagged_sentences
        ]

        named_entities = defaultdict(set)
        for sentence in chunked_sentences:
            for item in sentence:
                if isinstance(item, Tree):
                    ne_type = item.label()
                    entity = " ".join([
                        entity_component[0]
                        for entity_component in item.leaves()
                    ])

                    named_entities[ne_type].add(entity)

        return dict(named_entities)

    def analyse(self, text):
        """Analyse the given text

        :param str text: the text to analyse
        :rtype: TextAnalysisResult
        :return: the analysis result
        """
        readability_scores = calculate_readability_scores(text)

        keywords = None
        if isinstance(text, str) and len(text) != 0:
            keywords = extract_keywords(
                text=text, keyword_stop_list=self.__keyword_stop_list)

        sentences = sent_tokenize(text)
        sentence_words = [word_tokenize(sentence) for sentence in sentences]

        statistics = self._calculate_text_statistics(sentences, sentence_words)
        summary = create_summary(text)
        named_entities = self._extract_named_entities(sentence_words)

        return TextAnalysisResult(text=text,
                                  keywords=keywords,
                                  readability_scores=readability_scores,
                                  statistics=statistics,
                                  summary=summary,
                                  named_entities=named_entities)
Example #20
https://github.com/evanmiltenburg/Dutch-tagger

We have a POS tagger and a CHUNK tagger. In combination they can be used to apply NER...


"""

import os
import nltk

from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import alpino as alp  # Trained on ALP data.

tagger = PerceptronTagger(load=False)
os.chdir(r'D:\nlp_lib')
tagger.load('model.perc.dutch_tagger_small.pickle')  # I don't know the source of the training data

# Tag a sentence.
tagger.tag('Alle vogels zijn nesten begonnen , behalve ik en jij .'.split())

training_corpus = alp.tagged_sents()
unitagger = nltk.tag.UnigramTagger(training_corpus)
bitagger = nltk.tag.BigramTagger(training_corpus, backoff=unitagger)
perctagger = PerceptronTagger(load=True)  # load=True loads the pretrained English model; load=False would start from an empty tagger
perctagger.train(training_corpus)

sent = 'NLTK is een goeda taal voor NLP'.split()
bitagger.tag(sent)
unitagger.tag(sent)
perctagger.tag(sent)
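The snippet above loads the English model (load=True) before training on Dutch; a cleaner hedged variant starts from an empty tagger and holds out part of alpino for a quick accuracy check (the alpino corpus must be downloaded; older NLTK versions name the accuracy method evaluate):

import nltk
from nltk.corpus import alpino as alp
from nltk.tag.perceptron import PerceptronTagger

tagged = list(alp.tagged_sents())
train_sents, test_sents = tagged[:-500], tagged[-500:]

dutch_tagger = PerceptronTagger(load=False)   # empty model, no English weights
dutch_tagger.train(train_sents, nr_iter=5)
print(dutch_tagger.accuracy(test_sents))
print(dutch_tagger.tag('Alle vogels zijn nesten begonnen .'.split()))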
Example #21
    ('clf', OneVsRestClassifier(LinearSVC()))])
print "Classifier pipeline initialized."
labels = []
data = []
print "Processing the training file."
with open (file_path, 'r') as readFile :
    for row in readFile :
        arr = []
        sentences = []
        arr = row.split(",,,")
        arr[1] = arr[1].strip(' ')
        arr[1] = arr[1].strip('\n')
        labels.append(arr[1])
        sentences = nltk.sent_tokenize(arr[0])
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [tagger.tag(sentence) for sentence in tokenized_sentences]
        print tagged_sentences
        ner_sentence= [nltk.chunk.ne_chunk(pts) for pts in tagged_sentences]
        sent = ''
        for tokens in ner_sentence :
            word = ""
            for t in tokens :
                m = re.search('\(\'(.+?)\', \'(.+?)\'\)', str(t))
                e = re.search('\(([A-Z]+?) (.+?)\)', str(t))
                if m:
                    word = m.group(1)
                    pos = m.group(2)
                    if pos == "JJ" :
                        word = "JJ"
                    elif pos == "CC" :
                        word = "CC"
Example #22
		titles+=[title]
		bodies+=[standard_body(body)]
	else:
		titles+=['Unknown']
		bodies+=[standard_body(story)]
		
### -------------------------- ###
#      Tokenize and tag text     #
### -------------------------- ###
stories = pd.DataFrame({'title':titles,'body':bodies})
stories['sents'] = stories['body'].map(lambda x: nltk.sent_tokenize(x))
stories['words'] = stories['sents'].map(lambda x: [[w.lower() for w in nltk.word_tokenize(s)] for s in x])

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()
stories['tags'] = stories['words'].map(lambda x: [tagger.tag(s) for s in x])

### -------------------------- ###
#  Categorize tags into genres   #
### -------------------------- ###
# Note: Standards replace 'TG' with '{CLASS}' i.e. 'NN' -> '{ANML}'
""" Genres:
 - occupation
 - animals: is_animal
 - body parts: 
 - exclamation
 - food
 - location
"""
is_animal = lambda x: is_hyper_of(x,'animal')
is_bodypart = lambda x: is_hyper_of(x,'body_part')
Example #23
                                type_indexes_tmp = ann_items[1].split(" ")
                                type_indexes = type_indexes_tmp[0:2] + type_indexes_tmp[3:]
                            else:
                                type_indexes = ann_items[1].split(" ")
                            type_indexes[1] = int(type_indexes[1])
                            type_indexes[2] = int(type_indexes[2])
                            indexes_kp_tmp.setdefault(type_indexes[1], -1)
                            if indexes_kp_tmp[type_indexes[1]] < type_indexes[2]:
                                indexes_kp_tmp[type_indexes[1]] = type_indexes[2]
                            ann_text = ann_items[2]
                            tokens = tokenizer.tokenize(ann_text)
                            if without_types:
                                annotation_type = 'KeyPhrase'
                            else:
                                annotation_type = type_indexes[0]
                            pos_tags = [t + (annotation_type,)  for t in tagger.tag(tokens)]
                            if pos_tags:
                                pos_tags[0] = pos_tags[0][0:2] + ("B-" + pos_tags[0][2],)
                                if debug:
                                    print >> sys.stderr, pos_tags
                            annotations[" ".join([str(ti) for ti in type_indexes[1:]])] = pos_tags
                            #print >> ann_ext_file, " ".join([str(ti) for ti in type_indexes]) + "\t" + ann_text + "\t" + pos_tags
                    ann_file.close()
                    #ann_ext_file.close()

                    if debug:
                        pass
                        #print indexes_kp_tmp
                        #print annotations

                    file_text = os.path.join(dirname, f[:-4] + ".txt")
Example #24
# 	for (index,line) in enumerate(data_file):
# 		line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding!
# 		sents = nltk.sent_tokenize(line)
# 		nltk.pos_tag_sents(sents) # WRONG!!!!
# 		print index

# WAY-2: Faster  20s/1000
tagger = PerceptronTagger() 
with open(input_filename) as data_file:
	for (index,line) in enumerate(data_file):
		line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding!
		# sents = nltk.sent_tokenize(line)
		# print sents
		# sentences_pos=tagger.tag_sents(sents)
		word_list=nltk.word_tokenize(line)
		line_tagged=tagger.tag(word_list)
		if index in range(5000,60001,5000):
			print index
		# print line_tagged
		for t in line_tagged:
			output.write('_'.join(t)+' ')
		output.write('\n')


# # WAY-3: More precise but slower  21s/1000
# tagger = PerceptronTagger() 
# with open(input_filename) as data_file:
# 	for (index,line) in enumerate(data_file):
# 		line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding!
# 		sents = nltk.sent_tokenize(line)
# 		tokenized_sents = [nltk.word_tokenize(i) for i in sents]
Example #25
                        test_tokens = []
                        test_labels = []
                        dummy_label = "Dummy"
                        last_label = ""
                        if prev_token: 
                            test_tokens.append(prev_token)
                            last_label = dummy_label
                            test_labels.append(last_label)
                        for pjtk in  projection_tokens:
                            test_tokens.append(pjtk)
                            test_labels.append("None")
                        if next_token:
                            test_tokens.append(next_token)
                            test_labels.append(dummy_label)

                        test_pos_tags = tagger.tag(test_tokens)
                        tagged_text = [tpt + (test_labels[i],) for i, tpt in enumerate(test_pos_tags)]
                        if debug and False:
                            print >> sys.stderr, "Tagged projection", tagged_text

                        if extra_features:
                            X_test = kpc.sent2features_extra(tagged_text, qr)
                        else:
                            X_test = kpc.sent2features(tagged_text)

                        is_not_kp = "None"
                        tmp_label = is_not_kp
                        new_kp = []
                        new_list_labels = []

                        X_labeled = crftagger.tag(X_test)
Example #26
class NLP:
    def __init__(self, nodes):
        self.mwe_tokenizer = MWETokenizer(self._build_mwe(nodes.nodelist))
        self.lemmatizer = WordNetLemmatizer()
        self.tagger = PerceptronTagger()
        self.stop_words = set(stopwords.words("english"))

    @staticmethod
    def _build_mwe(nodelist):
        """Builds multi word expressions based on synonyms of nodes in nodelist.

        Parameters
        ----------
        nodelist: list(Node)

        Returns
        -------
        multi_word_expressions : list(str)
        """

        multi_word_expressions = []
        # iterate through synonyms in nodelist and append joined n-grams
        # (separator = "_") to above list.
        for node in nodelist:
            for idx in range(len(node.synonyms)):
                mwe = [word_tokenize(node.synonyms[idx].lower())]
                multi_word_expressions.append(tuple(mwe[0]))
        return multi_word_expressions

    def tokenize_into_words(self, text_input):
        """Splits strings into list of words with regard to multi word expressions
        generated from nodelist.

        Parameters
        ----------
        text_input : str

        Returns
        -------
        tokenized_string : list(str)
        """
        tokenized_string = self.mwe_tokenizer.tokenize(
            word_tokenize(text_input.lower()))
        return tokenized_string

    @staticmethod
    def tokenize_into_sentences(text_input):
        """Splits strings into list of sentences.

        Parameters
        ----------
        text_input: str

        Returns
        -------
        tokenized_string : list(str)

        """

        tokenized_string = sent_tokenize(text_input)
        return tokenized_string

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Used to translate part of speech tags (wordnet vs. nltk).

        Parameters
        ----------
        treebank_tag : str

        Returns
        -------
        wordnet.* : str

        References
        -------
        https://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python

        """

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # Alternatively: "" or None.

    def pos_tag_words(self, text_input):
        """Assigns part of speech tags to tokenized words.

        Parameters
        ----------
        text_input : str

        Returns
        -------
        pos_tagged_words : list(str)
        """

        sentences = sent_tokenize(text_input)
        pos_tagged_words = []
        for sentence in sentences:
            sentence_pos = self.tagger.tag(
                self.mwe_tokenizer.tokenize(word_tokenize(sentence)))
            for t in sentence_pos:
                pos_tagged_words.append(t)
        return pos_tagged_words

    def lemmatize_words(self, text_input):
        """Identifies the lemma of each word based on specified lemmatizer
        and part of speech tagger.

        Parameters
        ----------
        text_input : str

        Returns
        -------
        lemmatized_words : list(str)
        """

        lemmatized_words = []
        for w in self.pos_tag_words(text_input):
            if w[0] not in string.punctuation:
                lemmatized_word = self.lemmatizer.lemmatize(
                    word=w[0], pos=self.get_wordnet_pos(w[1]))
                lemmatized_words.append(lemmatized_word.lower())
        return lemmatized_words

    def list_based_stopword_removal(self, text_input):
        """Removes stopwords based on specified stopword list.

        Parameters
        ----------
        text_input : list(str)

        Returns
        -------
        filtered_words . list(str)
        """

        filtered_words = []
        for word in text_input:
            if word not in self.stop_words:
                filtered_words.append(word)
        return filtered_words

    def list_based_stopword_removal_for_pos_tagged_words(self, text_input):
        """Removes stopwords based on specified stopword list
        (specified for pos_tagged_words)

        Parameters
        ----------
        text_input : str

        Returns
        -------
        filtered_words : list(str)
        """

        filtered_words = []
        for word in text_input:
            if word[0] not in self.stop_words:
                filtered_words.append(word)
        return filtered_words
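A hedged usage sketch for the NLP helper above, with hypothetical stand-ins for the nodes/Node objects it expects (each node only needs a synonyms list); it assumes the imports the class relies on (MWETokenizer, WordNetLemmatizer, PerceptronTagger, stopwords, wordnet, sent_tokenize, word_tokenize, string) and the matching NLTK data are available:

from collections import namedtuple

Node = namedtuple("Node", ["synonyms"])     # hypothetical stand-in
Nodes = namedtuple("Nodes", ["nodelist"])   # hypothetical stand-in

nlp = NLP(Nodes(nodelist=[Node(synonyms=["machine learning"]),
                          Node(synonyms=["neural network"])]))
print(nlp.pos_tag_words("Machine learning models were trained quickly."))
print(nlp.lemmatize_words("Machine learning models were trained quickly."))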
Example #27
class HearstPatterns(object):

	def __init__(self):
		self.__chunk_patterns = r""" #  helps us find noun phrase chunks
 				NP: {<DT|PP\$>?<JJ>*<NN>+}
 					{<NNP>+}
 					{<NNS>+}
		"""

		self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns) # create a chunk parser 

		# now define the Hearst patterns
		# format is <hearst-pattern>, <general-term>
		# so, what this means is that if you apply the first pattern, the first Noun Phrase (NP)
		# is the general one, and the rest are specific NPs
		self.__hearst_patterns = [
			("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
			("(such NP_\w+ (, )?as (NP_\w+ ?(, )?(and |or )?)+)", "first"),
			("((NP_\w+ ?(, )?)+(and |or )?other NP_\w+)", "last"),
			("(NP_\w+ (, )?including (NP_\w+ ?(, )?(and |or )?)+)", "first"),
			("(NP_\w+ (, )?especially (NP_\w+ ?(, )?(and |or )?)+)", "first"),
		]

		self.__pos_tagger = PerceptronTagger()
		
	def prepare(self, rawtext):
		sentences = nltk.sent_tokenize(rawtext.strip()) # NLTK default sentence segmenter
		sentences = [nltk.word_tokenize(sent) for sent in sentences] # NLTK word tokenizer
		sentences = [self.__pos_tagger.tag(sent) for sent in sentences] # NLTK POS tagger

		return sentences

	def chunk(self, rawtext):
		sentences = self.prepare(rawtext.strip())

		all_chunks = []
		for sentence in sentences:
			chunks = self.__np_chunker.parse(sentence) # parse the example sentence
			#for chunk in chunks:
			#	print str(chunk)
			all_chunks.append(self.prepare_chunks(chunks))
		return all_chunks

	def prepare_chunks(self, chunks):
		# basically, if the chunk is NP, keep it as a string that starts w/ NP and replace " " with _
		# otherwise, keep the word.
		# remove punct
		# this is all done to make it super easy to apply the Hearst patterns...

		terms = []
		for chunk in chunks:
			label = None
			try: # gross hack to see if the chunk is simply a word or a NP, as we want. But non-NP fail on this method call
				label = chunk.label()
			except:
				pass

			if label is None: #means one word...
				token = chunk[0]
				pos = chunk[1]
				if pos in ['.', ':', '-', '_']:
					continue
				terms.append(token)
			else:
				np = "NP_"+"_".join([a[0] for a in chunk]) #This makes it easy to apply the Hearst patterns later
				terms.append(np)
		return ' '.join(terms)

	"""
		This is the main entry point for this code.
		It takes as input the rawtext to process and returns a list of tuples (specific-term, general-term)
		where each tuple represents a hypernym pair.

	"""
	def find_hyponyms(self, rawtext):

		hyponyms = []
		np_tagged_sentences = self.chunk(rawtext)

		for raw_sentence in np_tagged_sentences:
			# two or more NPs next to each other should be merged into a single NP, it's a chunk error

			# find any N consecutive NP_ and merge them into one...
			# So, something like: "NP_foo NP_bar blah blah" becomes "NP_foo_bar blah blah"
			sentence = re.sub(r"(NP_\w+ NP_\w+)+", lambda m: m.expand(r'\1').replace(" NP_", "_"), raw_sentence)

			for (hearst_pattern, parser) in self.__hearst_patterns:
				matches = re.search(hearst_pattern, sentence)
				if matches:
					match_str = matches.group(0)

					nps = [a for a in match_str.split() if a.startswith("NP_")]

					if parser == "first":
						general = nps[0]
						specifics = nps[1:]
					else:
						general = nps[-1]
						specifics = nps[:-1]

					for i in range(len(specifics)):
						hyponyms.append((self.clean_hyponym_term(specifics[i]), self.clean_hyponym_term(general)))

		return hyponyms


	def clean_hyponym_term(self, term):
		# good point to do the stemming or lemmatization
		return term.replace("NP_","").replace("_", " ")
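A short usage sketch for the HearstPatterns class above (the punkt and averaged_perceptron_tagger NLTK data packages are required); the output shown is only indicative:

hp = HearstPatterns()
text = "I like to listen to music from musical genres such as blues, rock and jazz."
print(hp.find_hyponyms(text))
# e.g. [('blues', 'genres'), ('rock', 'genres'), ('jazz', 'genres')]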
Example #28
class HearstPatterns(object):
    def __init__(self, extended=False):
        self.__chunk_patterns = r""" #  helps us find noun phrase chunks
                NP: {<DT|PP\$>?<JJ>*<NN>+}
                    {<NNP>+}
                    {<NNS>+}
        """

        self.__np_chunker = nltk.RegexpParser(
            self.__chunk_patterns)  # create a chunk parser

        # now define the Hearst patterns
        # format is <hearst-pattern>, <general-term>
        # so, what this means is that if you apply the first pattern, the first Noun Phrase (NP)
        # is the general one, and the rest are specific NPs
        self.__hearst_patterns = [
            ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
            ("(such NP_\w+ (, )?as (NP_\w+ ?(, )?(and |or )?)+)", "first"),
            ("((NP_\w+ ?(, )?)+(and |or )?other NP_\w+)", "last"),
            ("(NP_\w+ (, )?including (NP_\w+ ?(, )?(and |or )?)+)", "first"),
            ("(NP_\w+ (, )?especially (NP_\w+ ?(, )?(and |or )?)+)", "first"),
        ]

        if extended:
            self.__hearst_patterns.extend([
                ("((NP_\w+ ?(, )?)+(and |or )?any other NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?some other NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?is a NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?was a NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?were a NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?are a NP_\w+)", "last"),
                ("(NP_\w+ (, )?like (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("such (NP_\w+ (, )?as (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("((NP_\w+ ?(, )?)+(and |or )?like other NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?one of the NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?one of these NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?one of those NP_\w+)", "last"),
                ("examples of (NP_\w+ (, )?is (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("examples of (NP_\w+ (, )?are (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("((NP_\w+ ?(, )?)+(and |or )?are examples of NP_\w+)",
                 "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?is example of NP_\w+)", "last"),
                ("(NP_\w+ (, )?for example (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("((NP_\w+ ?(, )?)+(and |or )?wich is called NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?which is named NP_\w+)", "last"),
                ("(NP_\w+ (, )?mainly (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("(NP_\w+ (, )?mostly (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("(NP_\w+ (, )?notably (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?particularly (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?principally (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?in particular (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?except (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("(NP_\w+ (, )?other than (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?e.g. (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("(NP_\w+ (, )?i.e. (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("((NP_\w+ ?(, )?)+(and |or )?a kind of NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?kinds of NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?form of NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?forms of NP_\w+)", "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?which looks like NP_\w+)",
                 "last"),
                ("((NP_\w+ ?(, )?)+(and |or )?which sounds like NP_\w+)",
                 "last"),
                ("(NP_\w+ (, )?which are similar to (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?which is similar to (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?examples of this is (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?examples of this are (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?types (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("((NP_\w+ ?(, )?)+(and |or )? NP_\w+ types)", "last"),
                ("(NP_\w+ (, )?whether (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(compare (NP_\w+ ?(, )?)+(and |or )?with NP_\w+)", "last"),
                ("(NP_\w+ (, )?compared to (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("(NP_\w+ (, )?among them (NP_\w+ ? (, )?(and |or )?)+)",
                 "first"),
                ("((NP_\w+ ?(, )?)+(and |or )?as NP_\w+)", "last"),
                ("(NP_\w+ (, )? (NP_\w+ ? (, )?(and |or )?)+ for instance)",
                 "first"),
                ("((NP_\w+ ?(, )?)+(and |or )?sort of NP_\w+)", "last"),
            ])

        self.__pos_tagger = PerceptronTagger()

    def prepare(self, rawtext):
        sentences = nltk.sent_tokenize(
            rawtext.strip())  # NLTK default sentence segmenter
        sentences = [nltk.word_tokenize(sent)
                     for sent in sentences]  # NLTK word tokenizer
        sentences = [self.__pos_tagger.tag(sent)
                     for sent in sentences]  # NLTK POS tagger

        return sentences

    def chunk(self, rawtext):
        sentences = self.prepare(rawtext.strip())

        all_chunks = []
        for sentence in sentences:
            chunks = self.__np_chunker.parse(
                sentence)  # parse the example sentence
            #for chunk in chunks:
            #   print(str(chunk))
            all_chunks.append(self.prepare_chunks(chunks))
        return all_chunks

    def prepare_chunks(self, chunks):
        # basically, if the chunk is NP, keep it as a string that starts w/ NP and replace " " with _
        # otherwise, keep the word.
        # remove punct
        # this is all done to make it super easy to apply the Hearst patterns...

        terms = []
        for chunk in chunks:
            label = None
            try:  # gross hack to see if the chunk is simply a word or a NP, as we want. But non-NP fail on this method call
                label = chunk.label()
            except:
                pass

            if label is None:  #means one word...
                token = chunk[0]
                pos = chunk[1]
                if pos in ['.', ':', '-', '_']:
                    continue
                terms.append(token)
            else:
                np = "NP_" + "_".join([
                    a[0] for a in chunk
                ])  #This makes it easy to apply the Hearst patterns later
                terms.append(np)
        return ' '.join(terms)

    def replace_np_sequences(self, sentence):
        words = ""
        first_word_in_sequence = False
        for word in nltk.word_tokenize(sentence.replace("NP_", "_")):
            if word[0] == "_":
                if not first_word_in_sequence:
                    word = "NP" + word
                    first_word_in_sequence = True
                    words = words + " " + word
                else:
                    words += word
            else:
                words = words + " " + word
                first_word_in_sequence = False
        return words.strip()

    """
        This is the main entry point for this code.
        It takes as input the rawtext to process and returns a list of tuples (specific-term, general-term)
        where each tuple represents a hypernym pair.

    """

    def find_hyponyms(self, rawtext):

        hyponyms = []
        np_tagged_sentences = self.chunk(rawtext)
        #print "NP tagged-->",np_tagged_sentences

        for raw_sentence in np_tagged_sentences:
            # two or more NPs next to each other should be merged into a single NP, it's a chunk error

            # find any N consecutive NP_ and merge them into one...
            # So, something like: "NP_foo NP_bar blah blah" becomes "NP_foo_bar blah blah"
            sentence = self.replace_np_sequences(raw_sentence)

            for (hearst_pattern, parser) in self.__hearst_patterns:
                matches = re.search(hearst_pattern, sentence)
                if matches:
                    match_str = matches.group(0)

                    nps = [a for a in match_str.split() if a.startswith("NP_")]

                    if parser == "first":
                        general = nps[0]
                        specifics = nps[1:]
                    else:
                        general = nps[-1]
                        specifics = nps[:-1]
                        #print(str(general))
                        #print(str(nps))

                    for i in range(len(specifics)):
                        #print("%s, %s" % (specifics[i], general))
                        hs = self.clean_hyponym_term(
                            specifics[i]) + "-" + self.clean_hyponym_term(
                                general)
                        hyponyms.append(hs)
                        # hyponyms.append((self.clean_hyponym_term(general),self.clean_hyponym_term(general)))
                        # return hyponyms
        return hyponyms

    # from pyspark.sql.functions import udf
    # from pyspark.sql.types import ArrayType, StringType

    # find_hyponyms_udf = udf(lambda x: find_hyponyms(x), ArrayType(StringType()))

    def clean_hyponym_term(self, term):
        # good point to do the stemming or lemmatization
        return term.replace("NP_", "").replace("_", " ")
Example #29
class ActionDetection(object):
    def __init__(self, train=False):
        self.tagger = PerceptronTagger()
        self.model = None
        # BOW: triangle, rectangle, circle, hand
        # verbs: draw, wave, rotate
        self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle', 'circle', 'hand']
        self.VERBS = [wn.synset('draw.v.01'), wn.synset('wave.v.01'), wn.synset('rotate.v.01')]
        self.n_bow, self.n_verbs = len(self.BOW), len(self.VERBS)
        if train: self.train_svm()
        else: self.load_model()
        return

    def save_model(self):
        f = open(MODEL_PATH + 'action_detection.model', 'wb')
        pickle.dump(self.model, f)
        f.close()
        return

    def train_svm(self):
        with open(DATA_PATH+'action_detection_training_set.txt') as f:
            data = f.readlines()
        X, y = [],[]
        for line in data:
            line = line.strip()
            if not line: continue
            line = line.split(' ',1)
            X.append(self.extract_feature(line[1]))
            y.append(int(line[0]))
        lin_clf = svm.LinearSVC()
        lin_clf.fit(X, y)
        self.model = lin_clf
        self.save_model()
        return

    def load_model(self):
        f = open(MODEL_PATH + 'action_detection.model', 'rb')
        self.model = pickle.load(f)
        f.close()
        return

    def extract_feature(self, sent):
        feature = [0] * (self.n_bow+self.n_verbs)
        verbs = [ w for w,pos in self.tagger.tag(word_tokenize(sent)) if pos=='VB' ]
        words = set(sent.split())
        for i in xrange(self.n_bow):
            feature[i] = 1 if self.BOW[i] in words else 0
        for i in xrange(self.n_verbs):
            if not verbs:
                feature[self.n_bow+i] = 0
            else:
                similarities = [ wn.path_similarity(self.VERBS[i],wn.synset(v+'.v.01')) for v in verbs ]
                feature[self.n_bow+i] = max(similarities)
        return feature

    def predict(self, sent):
        # classes: 0(rectangle), 1(circle), 2(triangle), 3(wave), 4(rotate)
        feature = self.extract_feature(sent)
        idx = self.model.predict([feature])[0]
        probs = self.model._predict_proba_lr([feature])[0]
        # return value: 0(none), 1-5(classes+1)
        if probs[idx]>CONFIDENCE_THRESHOLD: return idx+1
        else: return 0
Example #30
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()

sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

tagger = PerceptronTagger(False)
tagger.load('file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
tagged = tagger.tag(tokens)
print tagged
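The snippet above points tagger.load at a local pickle; the portable route is to let NLTK resolve its own data path (a hedged sketch; the data package name and on-disk format of the default model vary across NLTK releases):

import nltk
from nltk.tag.perceptron import PerceptronTagger

nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)  # newer releases use a differently named package

sentence = "At eight o'clock on Thursday morning Arthur didn't feel very good."
tokens = nltk.word_tokenize(sentence)
print(PerceptronTagger().tag(tokens))   # the default constructor loads the bundled English model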
Example #31
class TermExtractor:
    def __init__(self, stopwords=[], term_patterns=[], min_term_length=3, min_term_words=2):
        #StopWordsDetector
        self.stopwords = set(stopwords)
        self.min_term_length = min_term_length
        self.term_patterns = term_patterns
        self.min_term_words = min_term_words
        self.detectors = []
        self.pos_tagger=PerceptronTagger()
        for tp in term_patterns:
            self.detectors.append(POSSequenceDetector(tp))
            
        self.swd = StopWordsDetector(self.stopwords)
        
    def extract_terms(self, doc_txt, trace=False):
        sent_tokenize_list = filter(lambda x: len(x) > 0, map(lambda s: nltk.tokenize.sent_tokenize(s), doc_txt))
        sentences = []
        _ = [sentences.extend(lst) for lst in sent_tokenize_list]
        if trace:
            print('len(sentences)=' + str(len(sentences)))

        terms = [] #pd.DataFrame(columns=['term'])
        #sentences = sentences[:30]

        i = 1
        filter_fn = lambda x: len(x) >= self.min_term_length
        max_i = len(sentences)
        for s in sentences:
            text = nltk.word_tokenize(s)
            #sent_pos_tags=nltk.pos_tag(text, tagset='universal')
            sent_pos_tags = self.pos_tagger.tag(text)
            sentence_terms = set()
            for fsa1 in self.detectors:
                stn = filter(filter_fn, [' '.join(t) for t in fsa1.detect(sent_pos_tags) if len(t) >= self.min_term_words and len(self.swd.detect(t)) == 0])
                sentence_terms.update(stn)
            terms.extend([str(trm).strip() for trm in sentence_terms])
            if trace:
                print(i, '/', max_i, s)
            i = i + 1
        return terms
    '''
    
    '''
    def c_values(self, terms, trace=False):
        terms_df = pd.DataFrame(terms, columns=['term'])
        terms_df['w'] = 1
        terms_df['len'] = len(terms_df['term'])
        term_stats = terms_df.groupby(['term'])['w'].agg([np.sum])
        term_stats['len'] = list(pd.Series(term_stats.index).apply(lambda x:len(x)))

        term_series = list(term_stats.index)
        n_terms = len(term_series)
        
        for i in range(0, n_terms):
            term_series[i]=' '+str(term_series[i])+' '

        
        term_stats['trm']=term_series
        term_stats.set_index('trm', inplace=True)

        A = ahocorasick.Automaton()
        for i in range(0, n_terms):
            A.add_word(term_series[i], (i, term_series[i]))
        A.make_automaton()

        is_part_of = []
        for i in range(0, n_terms):
            haystack=term_series[i]
            for end_index, (insert_order, original_value) in A.iter(haystack):
                if original_value!=haystack:
                    #print original_value, "insideof ", haystack
                    is_part_of.append((original_value, haystack, 1))
        subterms = pd.DataFrame(is_part_of, columns=['term', 'part_of', 'w']).set_index(['term', 'part_of'])

        if trace:
            print("terms/subterms relations discovered ...")

        c_values = []
        # term_series=['time']
        for t in term_series:
            if t in term_stats.index:
                current_term = term_stats.loc[t]
                # average frequency of the superterms
                c_value = 0
                if t in subterms.index:
                    subterm_of = list(subterms.loc[t].index)
                    for st in subterm_of:
                        c_value -= term_stats.loc[st]['sum']
                    c_value /= float(len(subterm_of))

                # add current term frequency
                c_value += current_term['sum']

                # multiply to log(term length)
                c_value = c_value * np.log(current_term['len']) if current_term['len']>0 else 0
                if trace:
                    print(t, 'freq=', current_term['sum'], ' cvalue=', c_value)
                c_values.append(c_value)
                # break

        return sorted(zip( [x.strip() for x in term_series], c_values), key=lambda x: x[1], reverse=True)
Example #32
    # # lecture/analyse PHRASES
    sents = []
    for textli in textlines:
        sents += sent_tokenize(textli)
    del textlines

    t2 = datetime.now()
    # # lecture/analyse MOTS + CATS
    pos_sents=[]
    for s in sents:
        tok_sent = []
        wtoks = word_tokenize(s)

        # A
        pos_sents.append(tagr.tag(wtoks))

        # B
        # for w in wtoks:
        #     tok_sent += pos_tag(w)
        # pos_sents.append(tok_sent)

    del sents

    t3 = datetime.now()

    # pos_sents=
    # [[('In', 'IN'),
    #   ('the', 'DT'),
    #   ('land', 'NN'),
    #   ('of', 'IN'),
Example #33
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tag.perceptron import PerceptronTagger
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from config import dataset_path



lemmatizer = WordNetLemmatizer()
tagger = PerceptronTagger()
pos_tag = lambda x : tagger.tag([x])


preprocessing_steps = ["remove_punctuation", "lowercase", "remove_numbers", "remove_stop_words", 
                        "keep_only_nouns", "lemmatization", "remove_infrequent_words", 
                        "keep_most_frequent_words", "make_bigrams"]


tag_to_keep = ['FW', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS']
#tag_to_keep = ['NN', 'NNS', 'NNP', 'NNPS']

preprocessed_dict = {} # save the preprocessing of each word to avoid re-computing it

def countWordsOnTexts(df):
    wordCounter = Counter()
    for i, row in df.iterrows():
Example #34
    from nltk.tag.perceptron import PerceptronTagger  # importing a tagger
    from nltk.tokenize import word_tokenize  # tokenizing the input
    tagger = PerceptronTagger()  # init the tagger in default mode

    for i in join:
        text_orig = i[1]
        year = i[0]
        text = text_orig
        text = nltk.word_tokenize(text)
        if keyword in text:
            ind = text.index(keyword)
            w1 = ind - 1
            w2 = ind + 1
            text = " ".join(text[w1:w2])

            for word, tag in tagger.tag(word_tokenize(text)):

                if tag == "JJ":  # check if the word is an adjective
                    # SentiWordNet
                    if len(list(swn.senti_synsets(word, "a"))) > 0:
                        synset = list(swn.senti_synsets(
                            word, "a"))[0]  # get the most likely synset
                        if synset.pos_score() > synset.neg_score():
                            positive.append([year, word, keyword, text_orig])

# counting positive by decade

decade_counts = [0] * 16
decade_words = [[]] * 16

for i in positive:
Example #35
class LemmaWordpieceTokenizer(BertTokenizer):
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 never_split=None,
                 additional_special_tokens=[
                     "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]",
                     "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
                 ],
                 **kwargs):
        self.inflection_tokens = additional_special_tokens
        self.tagger = PerceptronTagger()
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         never_split=never_split,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        self.have_inflections = {'NOUN', 'ADJ', 'VERB'}
        self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
        self.do_lower_case = do_lower_case
        if do_lower_case:
            self.cased_tokenizer = BasicTokenizer(do_lower_case=False,
                                                  never_split=never_split)
        else:
            self.cased_tokenizer = self.basic_tokenizer

    def _tokenize(self, text):
        tokenized = self.cased_tokenizer.tokenize(
            text, never_split=self.all_special_tokens)
        #print(tokenized)
        ptb_pos_tagged = self.tagger.tag(tokenized)
        #print(pos_tagged)
        #print(pos_tagged)
        universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                                for (token, tag) in ptb_pos_tagged]
        #print(universal_pos_tagged)
        split_tokens = []
        for i, (word, pos) in enumerate(ptb_pos_tagged):
            if self.do_lower_case:
                word = word.lower()
            if universal_pos_tagged[i][
                    1] in self.have_inflections and word not in (
                        string.punctuation +
                        '—') and pos not in self.lemma_tags:
                # (universal_)pos_tagged in the form of [(word, pos),(word, pos),...]
                # getLemma returns a tuple (lemma,)
                lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
                if not lemma:
                    lemma = word
                wordpieced = self.wordpiece_tokenizer.tokenize(lemma)
                #print(wordpieced)
                split_tokens.extend(wordpieced)
            else:
                wordpieced = self.wordpiece_tokenizer.tokenize(word)
                split_tokens.extend(wordpieced)
        return split_tokens

    def convert_tokens_to_string(self, tokens):
        result = []
        for i, token in enumerate(tokens):
            # combine wordpiece tokens
            if len(token) > 2 and token[:2] == '##':
                if result:
                    result[-1] += token[2:]
                else:
                    result.append(token[2:])
                continue
            if token in self.inflection_tokens:
                if i != 0:
                    inflected = getInflection(result[-1], tag=token[1:-1])
                    if inflected:
                        result[-1] = inflected[0]
            else:
                result.append(token)
        return ' '.join(result)
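The map_tag call used above converts Penn Treebank tags to the Universal tagset; a tiny standalone check (the universal_tagset mapping data must be downloaded):

from nltk.tag import map_tag

print(map_tag('en-ptb', 'universal', 'VBZ'))   # VERB
print(map_tag('en-ptb', 'universal', 'NNS'))   # NOUN
print(map_tag('en-ptb', 'universal', 'JJR'))   # ADJ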
Example #36
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
                    
                    file_text = os.path.join(dirname, f[:-4] + ".txt")
                    text_file = open(file_text, "r")
                    file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
                    kpe_file = open(file_kpe, "w")

                    raw_text = unicode(text_file.read(), encoding="utf-8")
                    tokens = tokenizer.tokenize(raw_text)
                    tagged_text = [t + ("None",)  for t in tagger.tag(tokens)]
                    text_file.close()
                    #test_sents.append(tagged_text)
                    if extra_features:
                        X_test = kpc.sent2features_extra(tagged_text, qr)
                    else:
                        X_test = kpc.sent2features(tagged_text)

                    is_not_kp = "None"
                    tmp_label = is_not_kp
                    new_kp = []
                    kp_list = []
                    for kp in zip(crftagger.tag(X_test), [tt[0] for tt in tagged_text]):
                        if debug and False:
                            print >> sys.stderr, "    ---- ", kp
                        if kp[0][0:2] == "B-":
Ejemplo n.º 37
0
class HearstPatterns(object):
    def __init__(self):
        self.__chunk_patterns = r"""  # helps us find noun phrase chunks
            NP: {<DT|PP\$>?<JJ>*<NN>+}
                {<NNP>+}
                {<NNS>+}
        """
        self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns)  # create a chunk parser

        # now define the Hearst patterns
        # format is <hearst-pattern>, <general-term>
        # so, what this means is that if you apply the first pattern, the first Noun Phrase (NP)
        # is the general one, and the rest are specific NPs
        self.__hearst_patterns = [
            ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?((and |or )NP_\w+)?)+)", "first"),  #
            ("(such NP_\w+ (, )?as (NP_\w+ ?(, )?(and |or )?)+)", "first"),
            ("((NP_\w+ ?(, )?)+(and |or )?other NP_\w+)", "last"),
            ("(NP_\w+ (, )?including (NP_\w+ ?(, )?(and |or )?)+)", "first"),
            ("(NP_\w+ (, )?especially (NP_\w+ ?(, )?(and |or )?)+)", "first"),
        ]

        self.__pos_tagger = PerceptronTagger()

    # divide the text into sentences, tokenize the sentences and add part-of-speech tags
    def prepare(self, rawtext):
        sentences = nltk.sent_tokenize(rawtext.strip())  # NLTK default sentence segmenter
        sentences = [nltk.word_tokenize(sent) for sent in sentences]  # NLTK word tokenizer
        sentences = [self.__pos_tagger.tag(sent) for sent in sentences]  # NLTK POS tagger

        return sentences

    # apply the chunking step on the sentences using the defined grammar for extracting noun phrases
    def chunk(self, rawtext):
        sentences = self.prepare(rawtext.strip())

        all_chunks = []
        for sentence in sentences:
            chunks = self.__np_chunker.parse(sentence)  # parse the example sentence
            all_chunks.append(self.prepare_chunks(chunks))
        return all_chunks

    # annotate the NP with a prefix NP_; also exclude 'other' and 'such' from the NP so they can be used in the patterns
    def prepare_chunks(self, chunks):
        # basically, if the chunk is NP, keep it as a string that starts w/ NP and replace " " with _
        # otherwise, keep the word.
        # remove punct
        # this is all done to make it super easy to apply the Hearst patterns...

        terms = []
        for chunk in chunks:
            label = None
            try:  # gross hack to see if the chunk is simply a word or a NP, as we want. But non-NP fail on this method call
                label = chunk.label()
            except:
                pass

            if label is None:  # means one word...
                token = chunk[0]
                pos = chunk[1]
                if pos in ['.', ':', '-', '_']:
                    continue
                terms.append(token)
            else:
                if chunk[0][0]=='such':
                    np = "such NP_" + "_".join([a[0] for a in chunk[1:]])
                elif chunk[0][0]=='other':
                    np = "other NP_" + "_".join([a[0] for a in chunk[1:]])
                else:
                    np = "NP_" + "_".join([a[0] for a in chunk])  # This makes it easy to apply the Hearst patterns later
                terms.append(np)
        return ' '.join(terms)

    # main method for extracting hyponym relations based on hearst patterns
    def find_hyponyms(self, folderpath,stopWord):
        all_sentences=list()
        hyponyms = []

        filelist = os.listdir(folderpath)
        for filePath in filelist:
            print ("processing file ","........................",filePath)
            file = codecs.open(folderpath + "//" + filePath, "r", encoding='utf-8', errors='ignore')
            lines = file.readlines()
            rawtext=(''.join(lines))
            rawtext=rawtext.lower()
            np_tagged_sentences = self.chunk(rawtext)
            for raw_sentence in np_tagged_sentences:
                # two or more NPs next to each other should be merged into a single NP, it's a chunk error
                # find any N consecutive NP_ and merge them into one...
                # So, something like: "NP_foo NP_bar blah blah" becomes "NP_foo_bar blah blah"
                sentence = re.sub(r"(NP_\w+ NP_\w+)+", lambda m: m.expand(r'\1').replace(" NP_", "_"), raw_sentence)
                # print  sentence
                for (hearst_pattern, parser) in self.__hearst_patterns:
                    matches = re.search(hearst_pattern, sentence)

                    if matches:
                        match_str = matches.group(1)
                        nps = [a for a in match_str.split() if a.startswith("NP_")]
                        if parser == "first":
                            general = nps[0]
                            specifics = nps[1:]
                        else:
                            general = nps[-1]
                            specifics = nps[:-1]
                        for i in range(len(specifics)):
                            if i==0:
                                e2=general.replace("NP_", "").replace("_", " ")
                                e1=specifics[0].replace("NP_", "").replace("_", " ")
                                clean_sen=sentence.replace("NP_", "").replace("_", " ")
                                clean_sen=clean_sen.replace(e1,"<e1>"+e1+"</e1>").replace(e2,"<e2>"+e2+"</e2>")
                                all_sentences.append(clean_sen.replace("NP_", "").replace("_", " "))
                                #print(clean_sen)
                            #print(general, specifics[i])
                            hyponyms.append(( self.clean_hyponym_term(general),self.clean_hyponym_term(specifics[i])))
            file.close()

        return self.refine_hyponym_term(hyponyms,stopWord),all_sentences

    def clean_hyponym_term(self, term):
        return term.replace("NP_", "").replace("_", " ")

    # remove stopwords and singularize specific and general concepts
    def refine_hyponym_term(self,hyponyms,stopWord):
        cleanedHyponyms=[]
        with open(stopWord) as f:
            stopWords = f.read().splitlines()

        for hyponym in hyponyms:
            #print(hyponym)
            specific = ' '.join([i for i in hyponym[1].split(' ') if not any(w == i.lower() for w in stopWords)])
            general = ' '.join([i for i in hyponym[0].split(' ') if not any(w == i.lower() for w in stopWords)])
            if specific == '' or general=='':
                print ('skipped relation: ', hyponym[1], 'is a ', hyponym[0])
                continue
            cleanedHyponyms.append((singularize(general) ,singularize(specific)))

        cleanedHyponyms.sort()
        return self.remove_duplicates(cleanedHyponyms)

    # remove duplicates in the hyponym list
    def remove_duplicates(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]
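# --- Hedged usage sketch (not part of the original example): assumes the NLTK
# resources 'punkt' and 'averaged_perceptron_tagger' are downloaded, and that the
# corpus folder and stop-word file named below are hypothetical paths.
hp = HearstPatterns()
chunked = hp.chunk("I enjoy many musical genres such as blues, rock and jazz.")
print(chunked)  # sentences with noun phrases collapsed into NP_-prefixed tokens
# hyponym extraction over a folder of plain-text documents:
# relations, tagged_sentences = hp.find_hyponyms('corpus_folder', 'stopwords.txt')
# relations is a list of (general, specific) pairs, e.g. ('genre', 'jazz')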
Ejemplo n.º 38
0
class BITETokenizer(object):
    inflection_tokens = [
        "[JJR]", "[JJS]", "[NNS]", "[NNPS]", "[RBR]", "[RBS]", "[VBD]",
        "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
    ]
    single_char_map = {
        "[JJR]": chr(9774),
        "[JJS]": chr(9775),
        "[NNS]": chr(9776),
        "[NNPS]": chr(9777),
        "[RBR]": chr(9778),
        "[RBS]": chr(9779),
        "[VBD]": chr(9780),
        "[VBG]": chr(9781),
        "[VBN]": chr(9782),
        "[VBP]": chr(9783),
        "[VBZ]": chr(9784)
    }
    reverse_single_char_map = {v: k for k, v in single_char_map.items()}
    lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
    have_inflections = {'NOUN', 'ADJ', 'VERB'}

    def __init__(self, pretokenizer='moses'):
        self.tagger = PerceptronTagger()
        self.pretok_type = pretokenizer
        if pretokenizer == 'bertpretokenizer':
            self.pretokenizer = BertPreTokenizer()
        elif pretokenizer == 'moses':
            self.pretokenizer = MosesTokenizer()
            self.detokenizer = MosesDetokenizer()
        elif pretokenizer == 'whitespace':
            pass
        else:
            raise ValueError(
                "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
            )

    def _pretokenize(self, sentence: str) -> List[str]:
        if self.pretok_type == 'bertpretokenizer':
            return [tup[0] for tup in self.pretokenizer.pre_tokenize(sentence)]
        elif self.pretok_type == 'whitespace':
            return sentence.split()
        else:
            return self.pretokenizer.tokenize(sentence)

    def tokenize(self,
                 sentence: Union[str, List[str]],
                 pretokenize: bool = True,
                 map_to_single_char: bool = False) -> List[str]:
        if pretokenize:
            pretokenized = self._pretokenize(sentence)
        else:
            # Allow users to pass in a list of tokens if using custom pretokenizers
            pretokenized = sentence
        ptb_pos_tagged = self.tagger.tag(pretokenized)
        universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                                for (token, tag) in ptb_pos_tagged]
        tokenized = []
        for i, (word, pos) in enumerate(ptb_pos_tagged):
            if universal_pos_tagged[i][
                    1] in self.have_inflections and word not in (
                        string.punctuation +
                        '—') and pos not in self.lemma_tags:
                lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
                if not lemma:
                    lemma = word
                tokenized.append(lemma)
                tokenized.append('[' + pos + ']')
            else:
                tokenized.append(word)
        if map_to_single_char:
            tokenized = [
                self.single_char_map[token]
                if token in self.inflection_tokens else token
                for token in tokenized
            ]
        return tokenized

    def detokenize(self,
                   tokens: List[str],
                   as_list: bool = False) -> Union[str, List[str]]:
        result = []
        for i, token in enumerate(tokens):
            # map single-character placeholders back to their inflection tokens
            if token in self.reverse_single_char_map:
                token = self.reverse_single_char_map[token]
            if token in self.inflection_tokens:
                if i != 0:
                    inflected = getInflection(result[-1], tag=token[1:-1])
                    if inflected:
                        result[-1] = inflected[0]
            else:
                result.append(token)

        if as_list:
            # Allow users to detokenize using their own detokenizers
            return result
        if self.pretok_type == 'moses':
            return self.detokenizer.detokenize(result)
        return ' '.join(result)
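# --- Hedged usage sketch (not part of the original example): assumes lemminflect
# (getLemma/getInflection), the NLTK perceptron tagger data, and the tokenizers
# imported in __init__ are available; the whitespace pretokenizer avoids the
# Moses dependency.
bite = BITETokenizer(pretokenizer='whitespace')
tokens = bite.tokenize("The dogs barked loudly")
print(tokens)  # roughly ['The', 'dog', '[NNS]', 'bark', '[VBD]', 'loudly'], tagging-dependent
print(bite.detokenize(tokens))  # re-inflects the lemmas, giving back the original wording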
Ejemplo n.º 39
0
                        ann = unicode(ann, encoding="utf-8")
                        if ann[0] not in ["R", "*"]:
                            ann_items = ann.strip().split("\t")
                            if ann_items[1].find(";") >= 0:
                                type_indexes_tmp = ann_items[1].split(" ")
                                type_indexes = type_indexes_tmp[0:2] + type_indexes_tmp[3:]
                            else:
                                type_indexes = ann_items[1].split(" ")
                            type_indexes[1] = int(type_indexes[1])
                            type_indexes[2] = int(type_indexes[2])
                            indexes_kp_tmp.setdefault(type_indexes[1], -1)
                            if indexes_kp_tmp[type_indexes[1]] < type_indexes[2]:
                                indexes_kp_tmp[type_indexes[1]] = type_indexes[2]
                            ann_text = ann_items[2]
                            tokens = tokenizer.tokenize(ann_text)
                            pos_tags = " ".join([pos[1] for pos in tagger.tag(tokens)])
                            ann_text = ann_text.encode("utf-8")
                            print >> ann_ext_file, " ".join([str(ti) for ti in type_indexes]) + "\t" + ann_text + "\t" + pos_tags
                    ann_file.close()
                    ann_ext_file.close()

                    file_text = os.path.join(dirname, f[:-4] + ".txt")
                    text_file = open(file_text, "r")
                    file_nokp = os.path.join(dir_output, f[:-4] + ".nann")
                    nokp_file = open(file_nokp, "w")

                    raw_text = unicode(text_file.read(), encoding="utf-8")

                    indexes_kp_tmp = sorted(indexes_kp_tmp.items(), key=operator.itemgetter(0,1))
                    indexes_kp = []
                    ikp_tmp = (-1, -1)
Ejemplo n.º 40
0
class TermExtractor:
    def __init__(self, stopwords=[], term_patterns=[], min_term_length=3, min_term_words=2):
        #StopWordsDetector
        self.stopwords = set(stopwords)
        self.min_term_length = min_term_length
        self.term_patterns = term_patterns
        self.min_term_words = min_term_words
        self.detectors = []
        self.pos_tagger=PerceptronTagger()
        for tp in term_patterns:
            self.detectors.append(POSSequenceDetector(tp))
            
        self.swd = StopWordsDetector(self.stopwords)
        
    def extract_terms(self, doc_txt, trace=False):
        sent_tokenize_list = filter(lambda x: len(x) > 0, map(lambda s: nltk.tokenize.sent_tokenize(s), doc_txt))
        sentences = []
        _ = [sentences.extend(lst) for lst in sent_tokenize_list]
        
        terms = [] #pd.DataFrame(columns=['term'])
        #sentences = sentences[:30]

        i = 1
        filter_fn = lambda x: len(x) >= self.min_term_length
        max_i = len(sentences)
        for s in sentences:
            text = nltk.word_tokenize(s)
            #sent_pos_tags=nltk.pos_tag(text, tagset='universal')
            sent_pos_tags = self.pos_tagger.tag(text)
            sentence_terms = set()
            for fsa1 in self.detectors:
                stn = filter(filter_fn, [' '.join(t) for t in fsa1.detect(sent_pos_tags) if len(t) >= self.min_term_words and len(self.swd.detect(t)) == 0])
                sentence_terms.update(stn)
            terms.extend(sentence_terms)
            if trace:
                print(i, '/', max_i, s)
            i = i + 1
        return terms
    def c_values(self, terms, trace=False):
        terms_df = pd.DataFrame(terms, columns=['term'])
        terms_df['w'] = 1
        terms_df['len'] = terms_df['term'].str.len()
        term_stats = terms_df.groupby(['term'])['w'].agg([np.sum])
        term_stats['len'] = list(pd.Series(term_stats.index).apply(lambda x:len(x)))
        term_stats.sort_values(by=['len'], ascending=True, inplace=True)


        term_series = list(term_stats.index)

        vectorizer = CountVectorizer(analyzer='word')
        vectorizer.fit(term_series)
        term_vectors = vectorizer.transform(term_series)

        n_terms = len(term_series)

        is_part_of = []

        for i in range(0, n_terms):
            for j in range(i + 1, n_terms):
                if scipy.spatial.distance.cosine(term_vectors[i].toarray(), term_vectors[j].toarray()) < 1:
                    # i may be inside j
                    if term_series[j].find(term_series[i]) >= 0:
                        # i is inside j
                        if trace:
                            print(term_series[i], " -- ", term_series[j])
                        is_part_of.append((term_series[i], term_series[j], 1))
        subterms = pd.DataFrame(is_part_of, columns=['term', 'part_of', 'w']).set_index(['term', 'part_of'])
        
        if trace:
            print("terms/subterms relations discovered ...")

        c_values = []
        # term_series=['time']
        for t in term_series:
            # print t
            current_term = term_stats.loc[t]
            # average frequency of the superterms
            c_value = 0
            if t in subterms.index:
                subterm_of = list(subterms.loc[t].index)
                for st in subterm_of:
                    c_value -= term_stats.loc[st]['sum']
                c_value /= float(len(subterm_of))

            # add current term frequency
            c_value += current_term['sum']

            # multiply to log(term length)
            c_value = c_value * np.log(current_term['len'])
            if trace:
                print(t, 'freq=', current_term['sum'], ' cvalue=', c_value)
            c_values.append(c_value)
            # break

        return sorted(zip(term_series, c_values), key=lambda x: x[1], reverse=True)
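# --- Hedged sketch (not part of the original example): exercises only the
# c_values ranking step. The StopWordsDetector helper used in __init__ is assumed
# importable from the same module; POSSequenceDetector is not needed here because
# term_patterns is left empty.
extractor = TermExtractor(stopwords=['the', 'of'], term_patterns=[])
candidate_terms = ['neural network', 'neural network', 'neural network',
                   'convolutional neural network', 'convolutional neural network',
                   'decision tree']
for term, c_value in extractor.c_values(candidate_terms):
    print(term, round(c_value, 2))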
Ejemplo n.º 41
0
class WN_Linker():
    def __init__(self, w2v, stopwords=None, lemmatizer=None):
        if stopwords == None:
            stopwords = nltk.corpus.stopwords.words('english')
        self.w2v,self.stopwords,self.unfound_words,self.unfound_defs,self.no_synsets = w2v,stopwords,[],[],[]
        #Define map from Penn Treebank tags to WN tags
        from nltk.tag.perceptron import PerceptronTagger
        self.tagger = PerceptronTagger()
        pt_pss = self.tagger.classes
        self.pos_map = {
            k: 'a' if k.startswith('J') else
            k[0].lower() if k[0] in ['N', 'V', 'R'] else None
            for k in pt_pss
        }
        self.pos_map[None] = None
        self.lemmatizer = lemmatizer if lemmatizer else WordNetLemmatizer()
        try:
            self.lemmatizer.lemmatize('rowing')
        except LookupError:
            nltk.download('wordnet')
            self.lemmatizer.lemmatize('rowing')

    def convert_to_uni_tag(self, token):
        return '_'.join(
            [token[0], nltk.map_tag('en-ptb', 'universal', token[1])])

    def tokenize(self, s):
        return [w for w in nltk.word_tokenize(s) if w not in self.stopwords]

    def tokenize_and_tag(self, sentence):
        return [
            self.convert_to_uni_tag(token)
            for token in nltk.pos_tag(nltk.word_tokenize(sentence))
            if token[0] not in self.stopwords
        ]

    def wv_similarity(self, w1, w2):
        return self.w2v.similarity(w1, w2)

    def wv_n_similarity(self, s1, s2):
        if not (len(self.tokenize_and_tag(s1))
                and len(self.tokenize_and_tag(s2))):
            return 0.
        try:
            return self.w2v.n_similarity(self.tokenize_and_tag(s1),
                                         self.tokenize_and_tag(s2))
        except KeyError:
            s1vecs = []
            s2vecs = []
            for w in self.tokenize_and_tag(s1):
                try:
                    s1vecs.append(self.w2v[w])
                except KeyError:
                    self.unfound_words.append(w)
            for w in self.tokenize_and_tag(s2):
                try:
                    s2vecs.append(self.w2v[w])
                except KeyError:
                    self.unfound_words.append(w)
            if not (len(s1vecs) and len(s2vecs)):
                self.unfound_defs.append(s2)
                return 0.
            return dot(matutils.unitvec(array(s1vecs).mean(axis=0)),
                       matutils.unitvec(array(s2vecs).mean(axis=0)))

    def get_vecs_for_BOW(self, bag):
        try:
            return [self.w2v[word] for word in self.tokenize_and_tag(bag)]
        except KeyError:
            vecs = []
            for w in self.tokenize_and_tag(bag):
                try:
                    vecs.append(self.w2v[w])
                except KeyError:
                    self.unfound_words.append(w)
            if vecs == []: return [np.zeros(300)]
            return vecs

    def compute_similarity(self, word_context, synset):
        return self.wv_n_similarity(word_context, synset.definition())

    def link_word_to_wn(self, word, context, pos=None, context_as_vec=False):
        orig_synsets = wordnet.synsets(word)
        synsets = [ss for ss in orig_synsets
                   if ss.pos() == pos] if pos else orig_synsets
        if len(synsets) == 0: synsets = orig_synsets
        #sims = softmax([self.compute_similarity(context,ss) for ss in synsets])
        if len(synsets) == 0:
            #set_trace()
            #print('no synsets for',word)
            self.no_synsets.append(word)
            return None
        if len(synsets) == 1: return synsets[0]
        synsets = synsets[:5]  # Ignore rare senses in highly polysemous words
        if context_as_vec:
            synsets_vecs = [
                self.get_vecs_for_BOW(ss.definition()) for ss in synsets
            ]
            synsets_vecs = [
                array(vecs).mean(0) for vecs in synsets_vecs if vecs
            ]
            synsets_vecs = [normalize_vec(vec) for vec in synsets_vecs]
            sims = np.matmul(array(synsets_vecs), context)
        else:
            sims = [self.compute_similarity(context, ss) for ss in synsets]
        _, sense = max(zip(sims, synsets))
        return sense

    def lemmatize(self, w):
        return self.lemmatizer.lemmatize(w)

    def get_synsets_of_rule_parse(self, dp, use_offset=True, convert=False):
        try:
            context = ' '.join(dp['captions'])
        except TypeError:  # Nan is in list
            context = ' '.join(
                [item for item in dp['captions'] if isinstance(item, str)])
        tokens = self.tokenize(context)
        vecs = self.get_vecs_for_BOW(context)
        context_vec = normalize_vec(array(vecs).mean(axis=0))
        context_vec = matutils.unitvec(context_vec)
        pos_tagged_context_dict = {
            self.lemmatize(k): v
            for k, v in self.tagger.tag(tokens)
        }  # <token>: <pos>
        new_atoms = []
        unique_entities = set([x for atom in dp['atoms'] for x in atom])
        entity_id_dict = {}
        for entity in unique_entities:
            try:
                pt_pos = pos_tagged_context_dict[entity]
            except KeyError:
                pt_pos = self.tagger.tag([entity])[0][-1]
            pos = self.pos_map[pt_pos]
            synset = self.link_word_to_wn(entity,
                                          context_vec,
                                          context_as_vec=True,
                                          pos=pos)
            #synset = self.link_word_to_wn(entity,context,pos=pos)
            if synset is None: offset = None
            else: offset = synset.offset()
            if use_offset: entity_id_dict[entity] = offset
            else: entity_id_dict[entity] = synset
        for atom in dp['atoms']:
            new_atom = []
            for entity in atom:
                new_atom.append((entity, entity_id_dict[entity]))
            new_atoms.append(new_atom)

        return convert_logical_caption(new_atoms) if convert else new_atoms
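# --- Hedged usage sketch (not part of the original example): 'vectors.kv' is a
# hypothetical gensim KeyedVectors file whose vocabulary follows the
# word_UNIVERSALTAG convention expected by convert_to_uni_tag (e.g. 'bank_NOUN');
# the NLTK wordnet and stopwords corpora must be downloaded.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load('vectors.kv')
linker = WN_Linker(w2v)
sense = linker.link_word_to_wn('bank', 'the muddy slope of the river', pos='n')
print(sense)  # ideally something like Synset('bank.n.01'), the river-bank sense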
Ejemplo n.º 42
0
class WordnetUtils():
    def __init__(self, path='../data/', win_size=4):
        self.path = path
        self.win_size = win_size
        self.create_synsets_dictionaries()
        self.create_lemma_unique_synset_dictionary()
        self.synsetid_synsetname_d = None
        self.synsetname_synsetid_d = None
        self.lemma_synsetid_d = None
        self.synsetid_lemma_d = None
        self.word_lemma_d = None
        self.tagger = PerceptronTagger()
        self.lemmatizer = WordNetLemmatizer()

    def create_synsets_dictionaries(self):
        '''
        From wordnet creates two dictionaries:
            synsetid -> synset name for ex. 2084071 -> 'dog.n.01'
            synset name -> synsetid  'dog.n.01' -> 2084071
        And pickle them to the disk
        '''
        f = '%s%s' % (self.path, synsetid_synset_name_d_filename)
        if not os.path.isfile(f):
            print("Generating synsetid to synset name dictionary")
            syns = list(wn.all_synsets())
            print('#of senses=' + str(len(syns)))
            offsets_list = [(s.offset(), s.name()) for s in syns]
            pdict = dict(offsets_list)
            pickle.dump(pdict, open(f, "wb"))
        else:
            print("Synset id to synset name dictionary already existing, skipping...")
            pdict = self.load_dict(synsetid_synset_name_d_filename)

        f = '%s%s' % (self.path, synset_name_synsetid_d_filename)
        if not os.path.isfile(f):
            print("Generating synset name to synset id dictionary")
            reverse_dictionary = dict(zip(pdict.values(), pdict.keys()))
            pickle.dump(reverse_dictionary, open(f, "wb"))
        else:
            print("Synset name to synset id dictionary already existing, skipping...")

    def create_lemma_unique_synset_dictionary(self):
        '''
        From wordnet creates a dictionary which keeps track of all the lemmas that are not ambiguous,
        i.e. each lemma maps to exactly one synsetid.
        It also creates the reverse dictionary and saves both to disk.
        For example:
        pingpong_table ->  4381587
        4381587 -> pingpong_table
        '''
        f = '%s%s' % (self.path, lemma_synsetid_d_filename)
        if not os.path.isfile(f):
            print("Generating lemma to synset id dictionary")
            lemmas_in_wordnet = set(chain(*[ss.lemma_names() for ss in wn.all_synsets()]))
            dictionary = dict()
            for lemma in lemmas_in_wordnet:
                synsets = wn.synsets(lemma)
                if len(synsets) == 1:
                    print("Lemma=%s, synset=%s" % (lemma, synsets))
                    dictionary[lemma] = synsets[0].offset()
            print('Number of lemmas with a unique synset=' + str(len(dictionary)))
            pickle.dump(dictionary, open(f, "wb"))
        else:
            print("lemma to synset id dictionary already existing, skipping...")
            dictionary = self.load_dict(lemma_synsetid_d_filename)
        f = '%s%s' % (self.path, synsetid_lemma_d_filename)
        if not os.path.isfile(f):
            print("Generating synset id to lemma dictionary")
            reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
            pickle.dump(reverse_dictionary, open(f, "wb"))
        else:
            print("synset id to lemma dictionary already existing, skipping...")

    def lookup_synset_name_from_synsetid(self, synsetid):
        '''
        If not loaded, loads synsetid_synset_name dictionary
        Then given a synsetid returns its name.
        :param synsetid:
        :returns part_name
        '''
        if self.synsetid_synsetname_d == None:
            self.synsetid_synsetname_d = self.load_dict(synsetid_synset_name_d_filename)
        if synsetid in self.synsetid_synsetname_d:
            name = self.synsetid_synsetname_d[synsetid]
            if name is None:
                return synsetid
            else:
                part_name = name.split('.')[0]
                return part_name
        else:
            return synsetid

    def lookup_synsetid_from_synset_name(self, name):
        '''
        If not loaded, loads synset_name_synsetid dictionary
        Then given a synset name returns its synsetid
        :param name:
        :returns synsetid
        '''
        if self.synsetname_synsetid_d == None:
            self.synsetname_synsetid_d = self.load_dict(synset_name_synsetid_d_filename)
        if name in self.synsetname_synsetid_d:
            synsetid = self.synsetname_synsetid_d[name]
            return synsetid
        return None

    def lookup_lemma_from_synsetid(self, synsetid):
        '''
        Given a synsetid, lookup its lemma
        :param synsetid:
        :returns lemma
        '''
        if self.synsetid_lemma_d == None:
            self.synsetid_lemma_d = self.load_dict(synsetid_lemma_d_filename)
        if synsetid in self.synsetid_lemma_d:
            synsetid = self.synsetid_lemma_d[synsetid]
            return synsetid
        return None

    def lookup_synsetid_from_lemma(self, lemma):
        '''
        Given a lemma, lookup its synset id
        :param lemma:
        :returns synsetid
        '''
        if self.lemma_synsetid_d == None:
            self.lemma_synsetid_d = self.load_dict(lemma_synsetid_d_filename)
        if lemma in self.lemma_synsetid_d:
            synsetid = self.lemma_synsetid_d[lemma]
            return synsetid
        return None

    def create_word_to_lemma_dictionary(self, words):
        '''
        Given a list of words, create word_lemma dictionary in order to speed up mapping of a word to a synsetid.
        :param words:
        '''
        tags = self.tagger.tag(words)
        print(tags[:5])
        word_lemma_d = dict()
        for word, tag in tags:
            pos = get_wordnet_pos(tag)
            #            print("word=%s,tag=%s,pos=%s" %(word, tag, pos))
            if pos is not None:
                lemma = self.lemmatizer.lemmatize(word, pos)
                if lemma is not None:
                    #                    print(lemma)
                    word_lemma_d[word] = lemma
                else:
                    print("Lemma not found for word=" + word)
        print("Word_lemma_dictionary size=%s" % (len(word_lemma_d.items())))
        return word_lemma_d

    def lookup_lemma_from_word(self, words, word):
        '''
        Given a word, lookup its lemma
        :param word:
        :returns lemma
        '''
        if self.word_lemma_d == None:
            self.word_lemma_d = self.create_word_to_lemma_dictionary(words)
        if word in self.word_lemma_d:
            lemma = self.word_lemma_d[word]
            return lemma
        return None

    def clean_data(self, words):
        '''
        Given a list of words, removes the stop words and returns the remaining words in their original order
        :param words:
        :returns data
        '''
        stops = set(stopwords.words("english"))
        filtered_words = [word for word in words if word not in stops]
        print(filtered_words[:10])
        count = collections.Counter(filtered_words).most_common()
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        for word in words:
            if word in dictionary:
                data.append(word)
        return data

    def load_dict(self, filename):
        '''
        Loads senses to names dictionary.
        :param filename
        :returns senses_names_d
         '''
        f = '%s%s' % (self.path, filename)
        senses_names_d = pickle.load(open(f, "rb"))
        return senses_names_d

    def generate_synsetids(self, words):
        '''
        This function generates a list of synsetids for a given list of words by mapping each word to its synsetid.
        Given a list of words:
        1. Removes stop words and keeps most common words
        2. for each word:
           - finds the lemma
           - using its lemma find the synsetid
           - if there is no synsetId then try to disambiguate the word using lesk
           - it is slow to disambiguate a large corpus, so only a window around the word
             is given as the disambiguation context
           - if we have a synset then add it to the list of synsetids if not just add the ambiguous word
        :param words:
        :return: synsetids
        '''
        filtered_words = self.clean_data(words)
        print("Words", filtered_words[:5])
        unknown_count = 0
        synsetids = []
        i = 0
        words_sz = len(words)
        for word in words:
            lemma = self.lookup_lemma_from_word(words, word)
            if lemma is None:
                unknown_count += 1
                synsetid = word
            else:
                synsetid = self.lookup_synsetid_from_lemma(lemma)
                if synsetid is None:
                    # synset = lesk(sentence, word)
                    (li, ri) = get_window_indices(words_sz, i, self.win_size)
                    win_words = words[li: i] + words[i: ri + 1]
                    # print(win_words)
                    synset = lesk(win_words, word)
                    if synset is None:
                        unknown_count += 1
                        synsetid = word
                    else:
                        synsetid = synset.offset()
            if synsetid == word:
                print("No synsetid for=", word)
            else:
                print("word=%s, synsetName=%s, synsetId=%d" % (
                word, self.lookup_synset_name_from_synsetid(synsetid), synsetid))
            synsetids.append(synsetid)
            i += 1
        synsetids_size = len(synsetids)
        print(synsetids_size)
        if synsetids_size != 0:
            rejected_percent = 100.0 * (unknown_count / synsetids_size)
        else:
            rejected_percent = 0
        print("Synsetids_size list size=%d, unknown=%.3f" % (synsetids_size, rejected_percent))
        return synsetids

    def generate_lemma_pos_with_win(self, words):
        '''
        This function uses the selected tagger (by default the Perceptron Tagger) and a lemmatizer
        to find, for each word, its lemma and POS using a context window.
        It returns a list of (lemma, POS) tuples.
        :param words:
        :return: tuples
        '''
        #filtered_words = self.clean_data(words)
        filtered_words = words
        #print("Filtered words", filtered_words[:5])
        tuples = []
        words_sz = len(filtered_words)
        print("Processing %d words" % words_sz)
        i = 0
        unknown_count = 0
        for word in filtered_words:
            (li, ri) = get_window_indices(words_sz, i, self.win_size)
            win_words = filtered_words[li: i] + filtered_words[i: ri + 1]
            logging.debug("Word=%s, win_words=%s" %(word,win_words))
            tags = []
            try:
                tags = self.tagger.tag(win_words)
            except:
                print("Unexpected error:%s" %sys.exc_info()[0])
            if len(tags) == 0:
                continue
            word_tag_d = dict(tags)
            w_tag = word_tag_d[word]
            if w_tag is None:
                #print("No tag for word=%s" % word)
                i += 1
                unknown_count += 1
                tuples.append((word, None))
                continue
            pos = get_wordnet_pos(w_tag)
            if pos is None:
                #print("No pos for word=%s" % word)
                i += 1
                unknown_count += 1
                tuples.append((word, None))
                continue
            logging.debug("word=%s,tag=%s,pos=%s" %(word, w_tag, pos))
            lemma = word
            try:
                lemma = self.lemmatizer.lemmatize(word, pos)
            except:
                print("Unexpected error:%s" %sys.exc_info()[0])
            # print("Lemma=%s" %lemma)
            tuples.append((lemma, pos))
            # synset = lesk(win_words, lemma, pos)
            # print("Synset=%s" %synset)
            i += 1
            logging.debug("Processing i=%d,total=%d" % (i, words_sz))

        if words_sz != 0:
            rejected_percent = 100.0 * (unknown_count / words_sz)
        else:
            rejected_percent = 0
        print("word_size=%d, unknown=%.3f" % (words_sz, rejected_percent))
        return tuples

    def generate_lemma_pos(self, words):
        '''
        This is similar to the function above, but without a contextual window.
        :param words:
        :return: tuples
        '''
        words_sz = len(words)
        tags = []
        try:
            tags = self.tagger.tag(words)
        except:
            print("Unexpected error:%s" %sys.exc_info()[0])
        #print('Tags=%s' % tags[:5])
        i = 0
        tuples = []
        unknown_count = 0
        for word, tag in tags:
            pos = get_wordnet_pos(tag)
            if pos is None:
                #print("No pos for word=%s" % word)
                tuples.append((word, None))
                i += 1
                unknown_count += 1
                continue
            lemma = word
            try:
                lemma = self.lemmatizer.lemmatize(word, pos)
            except:
                print("Unexpected error:%s" %sys.exc_info()[0])
            #print("word=%s,lemma=%s,tag=%s,pos=%s" % (word, lemma, tag, pos))
            tuples.append((lemma, pos))
            i += 1
        if words_sz != 0:
            rejected_percent = 100.0 * (unknown_count / words_sz)
        else:
            rejected_percent = 0
        #print("word_size=%d, unknown=%.3f" % (words_sz, rejected_percent))
        return tuples

    def generate_lemma_given_pos(self, words, pos):
        '''
       This helper function is used for the questions-answers challenge.
       The tagger is sometimes wrong, so this generates the lemma of each word using the given POS.
       It returns a list of (lemma, pos).
       :param words:
       :return: tuples
       '''
        tuples = []
        for word in words:
            lemma = self.lemmatizer.lemmatize(word, pos)
            #print("word=%s,lemma=%s" % (word, lemma))
            tuples.append((lemma, pos))
        return tuples

    def generate_lemma_adj_adv(self, words):
        '''
       This helper function is used for the questions-answers challenge.
       The function returns (lemma1,'a'),(lemma2,'r'),(lemma3,'a'),(lemma4,'r')
       :param words:
       :return: tuples
       '''
        tuples = []
        if len(words) != 4:
            return tuples
        lemma = self.lemmatizer.lemmatize(words[0], 'a')
        tuples.append((lemma, 'a'))
        lemma = self.lemmatizer.lemmatize(words[1], 'r')
        tuples.append((lemma, 'r'))
        lemma = self.lemmatizer.lemmatize(words[2], 'a')
        tuples.append((lemma, 'a'))
        lemma = self.lemmatizer.lemmatize(words[3], 'r')
        tuples.append((lemma, 'r'))
        return tuples

    def generate_lemma_noun_adj(self, words):
        '''
       This helper function is used for the questions-answers challenge.
       The function returns (lemma1,'n'),(lemma2,'a'),(lemma3,'n'),(lemma4,'a')
       :param words:
       :return: tuples
       '''
        tuples = []
        if len(words) != 4:
            return tuples
        lemma = self.lemmatizer.lemmatize(words[0], 'n')
        tuples.append((lemma, 'n'))
        lemma = self.lemmatizer.lemmatize(words[1], 'a')
        tuples.append((lemma, 'a'))
        lemma = self.lemmatizer.lemmatize(words[2], 'n')
        tuples.append((lemma, 'n'))
        lemma = self.lemmatizer.lemmatize(words[3], 'a')
        tuples.append((lemma, 'a'))
        return tuples

    def transform(self, synsetids):
        words = []
        for s in synsetids:
            w = self.lookup_synset_name_from_synsetid(s)
            words.append(str(w))
        return words
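# --- Hedged usage sketch (not part of the original example): assumes the NLTK
# wordnet, stopwords and averaged_perceptron_tagger resources are downloaded,
# '../data/' is writable for the pickled dictionaries built on first use, and the
# module-level helpers (get_wordnet_pos, get_window_indices, lesk, filename
# constants) referenced above are defined elsewhere in the same module.
wu = WordnetUtils(path='../data/', win_size=4)
words = "the quick brown fox jumps over the lazy dog".split()
print(wu.generate_lemma_pos(words))       # [(lemma, wordnet_pos_or_None), ...]
synsetids = wu.generate_synsetids(words)  # WordNet offsets, or the raw word when unresolved
print(wu.transform(synsetids))            # map the offsets back to readable synset names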
Ejemplo n.º 43
0
df.columns = docs.get('ColNames')
print(df)

#================================================================
#****************************************************************

# PerceptronTagger

# Need to import and download averaged_perceptron_tagger
import nltk
nltk.download('averaged_perceptron_tagger')

# Process the text data
from nltk.tag.perceptron import PerceptronTagger
PT = PerceptronTagger()
print(PT.tag('This is a sample English sentence'.split()))
#----output----
[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('English', 'JJ'),
('sentence', 'NN')]

# To get help about tags
nltk.help.upenn_tagset('NNP')  # can run this in IPython Console

# Alternatively, use this method involving tokenizer 
import nltk
nltk.download('punkt') # can run this in IPython Console

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

Percept_list = pos_tag(word_tokenize("John's big idea isn't all that bad."))
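# The result can be printed directly; the tags shown are what the default NLTK
# model typically produces for this sentence (exact output may vary by version).
print(Percept_list)
# e.g. [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ...]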
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
# nltk.ne_chunk below also requires: nltk.download('maxent_ne_chunker'); nltk.download('words')

tagger = PerceptronTagger()
name_tag = []
non_name_tag = []
full_names = []
non_names = []

with open('full_names.txt') as infile:
    full_names = infile.read().splitlines()

with open('non_names.txt') as infile:
    non_names = infile.read().splitlines()

for x in full_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    name_tag.append(tags[0][0][1]+' '+tags[0][1]+' '+tags[1][0][1]+' '+tags[1][1])

for x in non_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    non_name_tag.append(tags[0][0][1]+' '+tags[0][1]+' '+tags[1][0][1]+' '+tags[1][1])

name_tag = [w.split() for w in name_tag]
non_name_tag = [w.split() for w in non_name_tag]

train_data = np.vstack((np.array(name_tag), np.array(non_name_tag)))
train_f1 = label_binarize(train_data[:,0], classes=list(set(train_data[:,0])))
train_f2 = label_binarize(train_data[:,1], classes=list(set(train_data[:,1])))
train_f3 = label_binarize(train_data[:,2], classes=list(set(train_data[:,2])))
train_f4 = label_binarize(train_data[:,3], classes=list(set(train_data[:,3])))
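# --- Hedged continuation sketch (not part of the original example): one way the
# binarized tag features could be combined and fed to a simple scikit-learn
# classifier; the label convention (1 = full name, 0 = non-name) is an assumption.
from sklearn.linear_model import LogisticRegression

X = np.hstack((train_f1, train_f2, train_f3, train_f4))
y = np.array([1] * len(name_tag) + [0] * len(non_name_tag))
clf = LogisticRegression(max_iter=1000).fit(X, y)
print('training accuracy:', clf.score(X, y))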
Ejemplo n.º 45
0
out_filename = './sentences/subj-verb.txt'
f = open(out_filename, 'w')

with_verb_count = 0  # count how many sentences match our search

for i, sent in enumerate(corpus, start=1):
    # Logging
    if i % 10000 == 0:
        print('{} sentences analysed.'.format(i))

    # Prune sentences with unknown tokens
    if '<unk>' in sent:
        continue

    tagged_sent = tagger.tag(sent.split())
    with_verb = False

    for w, t in tagged_sent:
        # 3rd-singular or non-3rd-singular, present tense
        if t in ('VBZ', 'VBP'):
            with_verb = True
            sent = sent.replace(w, '*{}*'.format(w), 1)

    if with_verb:
        print(sent, file=f)
        with_verb_count += 1

print('>> {} sentences with VBZs and VBPs were saved to {}'.format(
    with_verb_count, out_filename))
f.close()
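# --- Hedged setup sketch (not part of the original example): the loop above
# assumes `corpus` (an iterable of raw sentences) and `tagger` were created
# earlier; a minimal stand-in could look like this.
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()
corpus = ['the cats sleep on the mat', 'he walked home yesterday', 'she writes code']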
                    print file_count, f[:-4]
                    try:
                        file_text = os.path.join(dirname, f[:-4] + ".txt")
                        text_file = open(file_text, "r")
                        raw_text = unicode(text_file.read(), encoding="utf-8")
                        file_kpe = os.path.join(dir_output, f[:-4] + ".ann")
                        kpe_file = open(file_kpe, "w")
                    except:
                        print >> sys.stderr, "E) Open files: ", sys.exc_info()

                    text_tokens = tokenizer.tokenize(raw_text)
                    text_tokens = kpcommon.escape_not_abbreviations(text_tokens)
                    #if debug:
                    #    print text_tokens, len(text_tokens)

                    pos_tags = tagger.tag(text_tokens)
                    pos_tags_string = " ".join([pt[1] for pt in pos_tags])
                    #print "\n".join([str(ptstr) for ptstr in pos_tags])
                    kps_candidates = {}
                    test_match = 0
                    for ct in pos_sequences:
                        if is_posregex:
                            pos_regex = re.compile("\s?(" + ct[2] + ")\s")
                        else:
                            pos_regex = re.compile("\s?(" + re.escape(ct[2]) + ")\s")
                        for pos_match in re.finditer(pos_regex, pos_tags_string):
                            pos_match_string = pos_match.group(1)
                            pos_seq_start = pos_match.start(1)
                            pos_seq_end = pos_match.end(1)
                            token_index_start = len(pos_tags_string[:pos_seq_start].split())
                            token_offset = len(pos_tags_string[pos_seq_start:pos_seq_end].split())