Example #1
def ap(train_path, test_path):
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'

    test_sentences = list(gen_corpus(test_path))

    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))

        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)

    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))

    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
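A minimal, self-contained sketch of the same train/save/load cycle, using a tiny hard-coded corpus in place of the project-specific gen_corpus() and convert_sents_to_zipped() helpers; it assumes an NLTK release where PerceptronTagger still offers the pickle-based train(..., save_loc=...) and load(...) API used above.

from nltk.tag.perceptron import PerceptronTagger

# Toy training data: a list of sentences, each a list of (word, tag) pairs.
toy_corpus = [
    [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')],
    [('a', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
]

# Train a fresh model and persist it to disk.
tagger = PerceptronTagger(load=False)
tagger.train(toy_corpus, save_loc='toy-ap-model.pickle', nr_iter=5)

# Reload the saved model and tag an unseen sentence.
reloaded = PerceptronTagger(load=False)
reloaded.load('toy-ap-model.pickle')
print(reloaded.tag(['the', 'dog', 'sleeps']))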
Example #2
def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
Example #3
def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == 'eng':
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger
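For context, a hedged sketch of how a helper like _get_tagger() is typically consumed: the resolved tagger is passed, together with the tokens and optional tagset, to the private nltk.tag._pos_tag helper (whose signature has varied across NLTK versions), just as the snippets further below do.

import nltk

def pos_tag_with_helper(tokens, tagset=None, lang='eng'):
    # Resolve a language-specific PerceptronTagger, then delegate to _pos_tag.
    tagger = _get_tagger(lang)
    return nltk.tag._pos_tag(tokens, tagset, tagger, lang)

print(pos_tag_with_helper(['The', 'quick', 'brown', 'fox', 'jumps', '.']))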
Example #4
    def __init__(self, stop_words=None, sent_detector=None, documents=None):
        # Load a persisted list of stopwords
        # unless something else is specified
        if not stop_words:
            self._stop_words = stopwords.words('english')
            self._stop_words += ['minister', 'question', 'member', "member’s"]
        else:
            self._stop_words = stop_words
        # Load these to save time reading from
        # disk later
        if not sent_detector:
            self._sent_detector = nltk.data.load(
                'tokenizers/punkt/english.pickle')
        else:
            self._sent_detector = sent_detector
        self._tokenizor = TreebankWordTokenizer()
        self._lemmatizer = WordNetLemmatizer()
        self._tagger = PerceptronTagger()

        if documents:
            # Create a list of lists of tokens, all lowercased,
            # lemmatized and filtered
            self.documents = self._make_tokens(documents)
            self.dictionary = corpora.Dictionary(self.documents)
            self.corpus = self._bag_o_words(self.documents)
            self._tf_idf_model = models.TfidfModel(self.corpus)
            self.transformed_corpus = self._tf_idf_model[self.corpus]
        else:
            self.documents = None
Example #5
    def wordTagging(self, sentence):
        posWords = []
        # text = nltk.word_tokenize(sentence)
        tagset = None
        start_time = time.time()
        try:
            tokens = nltk.word_tokenize(sentence)
        except UnicodeDecodeError:
            tokens = nltk.word_tokenize(sentence.decode('utf-8'))

        tagger = PerceptronTagger()
        pos_arrs = nltk.tag._pos_tag(tokens, tagset, tagger)
        for pos_arr in pos_arrs:
            pos = []
            pos.append(pos_arr[0])
            pos.append(pos_arr[1])
            # print pos_arr
            if pos_arr[1] in self.posDictionary:
                ppt = self.posDictionary[pos_arr[1]]
            else:
                ppt = "symbol"

            pos.append(ppt)
            posWords.append(pos)

        return posWords
Example #6
def Top_K_verbs(k, df, text_col, class_col, plot=False):
    vect = TfidfVectorizer()
    vect.fit(df[text_col])
    tfidf_df = pd.DataFrame(vect.transform(df[text_col]).toarray())
    tfidf_df.columns = vect.get_feature_names()
    tfidf_T = tfidf_df.transpose()
    tagger = PerceptronTagger()
    tfidf_T['pos'] = tagger.tag(tfidf_T.index)
    tfidf_T = tfidf_T[tfidf_T['pos'].apply(
        lambda tup: tup[1] in ['VB', 'VBD', 'VBG', 'VBN'])]
    tfidf_df = tfidf_T.drop(['pos'], axis=1).transpose()
    top_k_by_class = dict()
    for v in df[class_col].value_counts().index:
        freq_in_class = tfidf_df[df[class_col] == v].sum(axis=0).sort_values(
            ascending=False)
        frac_in_class = freq_in_class / freq_in_class.sum()
        top_k_by_class[v] = frac_in_class[:k].index

        if plot:
            print('the top {} frequent verbs for class {}:'.format(k, v))
            plt.figure(figsize=(5, 10))
            sns.barplot(y=frac_in_class[:k].index, x=frac_in_class[:k])
            plt.xlabel('fraction')
            plt.show()

    return (top_k_by_class)
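A hypothetical call, assuming a small pandas DataFrame with a free-text column and a class-label column, and a scikit-learn version where the get_feature_names() call used above is still available.

import pandas as pd

df = pd.DataFrame({
    'text': ['they ran and jumped quickly', 'she wrote and mailed a letter'],
    'label': ['sports', 'office'],
})

# Maps each class label to the k verbs with the largest TF-IDF share.
top_verbs = Top_K_verbs(3, df, text_col='text', class_col='label')
print(top_verbs)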
Example #7
 def get_instances(self, label_file, xml_file):
     instances = []
     labels_final = set()
     tagger = PerceptronTagger()  # load nltk perceptron just once to speed up tagging
     labels_dict = {
         0: "anger",
         1: "disgust",
         2: "fear",
         3: "joy",
         4: "sadness",
         5: "surprise"
     }
     tree = ET.parse(xml_file)
     root = tree.getroot()
     with open(label_file) as f:
         for sent, line in izip(root, f):
             id_xml = sent.attrib.values()[0]
             id_labels = line.rstrip().split()
             id_file = id_labels[0]
             if id_xml == id_file:
                 for i in sent.itertext():
                     text = i
                 labels = id_labels[1:]
                 label = labels.index(
                     str(max([int(label) for label in labels])))
                 inst = Instance(text, labels_dict[label])
                 inst_tokenized = word_tokenize(text)
                 inst_tagged = tagger.tag(inst_tokenized)
                 for tokentag in inst_tagged:
                     token = Token(tokentag[0], tokentag[1])
                     inst.add_token(token)
                 instances.append(inst)
                 labels_final.add(label)
         return instances, labels_final
Example #8
    def __init__(self, dbFilePath=None, revDbFilePath=None, pos=False):
        self.dbFilePath = dbFilePath
        self.revDbFilePath = revDbFilePath
        if pos:
            self.tagger = PerceptronTagger()
        self.pos = pos

        # try to open forward database
        if not dbFilePath:
            self.dbFilePath = os.path.join(os.path.dirname(__file__),
                                           "markovdb")
        try:
            with open(self.dbFilePath, 'rb') as dbfile:
                self.db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warning(
                'Database file corrupt or not found, using empty database')
            self.db = _db_factory()

        # try to open backwards database
        if not revDbFilePath:
            self.revDbFilePath = os.path.join(os.path.dirname(__file__),
                                              "revmarkovdb")
        try:
            with open(self.revDbFilePath, 'rb') as dbfile:
                self.rev_db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warning(
                'Database file corrupt or not found, using empty database')
            self.rev_db = _db_factory()
Example #9
 def get_instances(self, folder):
     # happiness/joy?
     labels_dict = {
         "hp": "joy",
         "sd": "sadness",
         "ag": "anger",
         "dg": "disgust",
         "sp": "surprise",
         "fr": "fear"
     }
     instances = []
     labels = set()
     tagger = PerceptronTagger()  # load nltk perceptron just once to speed up tagging
     with open(folder) as f:
         for line in f:
             label, id, text = line.strip().split(
                 " ", 2)  # split by first two spaces only
             if label == "ne":  # ignore no emotion
                 continue
             inst = Instance(text, labels_dict[label])
             inst_tokenized = word_tokenize(text)
             inst_tagged = tagger.tag(inst_tokenized)
             for tokentag in inst_tagged:
                 token = Token(tokentag[0], tokentag[1])
                 inst.add_token(token)
             instances.append(inst)
             labels.add(label)
     return instances, labels
Example #10
 def __init__(self):
   self.hmm_models = []
   self.n_hmm = 0
   self.hmm2idx = {}
   self.idx2hmm = {}
   self.tagger = PerceptronTagger()
   return
Example #11
def pos_tagger(infile, outfile):
    start = time.time()
    tagged_ngrams_counter = Counter()
    tagger = PerceptronTagger()
    with io.open(infile, encoding='utf-8', mode='rt') as text_file:
        for i, line in enumerate(text_file):
            if i % 100000 == 0:
                print(
                    f'{os.getpid()} process, {i} lines, {time.time()-start:.1f} time'
                )
            if n == 1:
                tagged_ngrams = tagger.tag(line.rstrip().split(' '))
            else:
                tagged_ngrams = [
                    tuple(tagger.tag(ngram))
                    for ngram in get_ngrams(line.rstrip().split(' '), n)
                ]
            tagged_ngrams_counter.update(tagged_ngrams)
        with open(outfile, mode='wb') as counter_pickle, \
            open(outfile[:-6]+'csv', 'w', encoding='utf-8') as counter_csv:
            pickle.dump(tagged_ngrams_counter, counter_pickle)
            counter_csv.write(csv_headers[n])
            if n == 1:
                for tagged_ngram, count in tagged_ngrams_counter.items():
                    counter_csv.write('{} {} {}\n'.format(
                        *tagged_ngram, count))
            else:
                for tagged_ngram, count in tagged_ngrams_counter.items():
                    ngram, tags = zip(*tagged_ngram)
                    counter_csv.write('{} {} {}\n'.format(
                        ' '.join(ngram), ' '.join(tags), count))
Example #12
def get_all_terms_in_sent(reg_exp, sent):
    tokens = nltk.word_tokenize(sent)
    # tags = nltk.pos_tag(tokens)
    pretrain = PerceptronTagger()
    tags = pretrain.tag(tokens)
    tags = [[tag[0], tag[1]] for tag in tags]
    if (not (tags[0][1].startswith("NNP"))):
        tags[0][0] = tags[0][0].lower()
    tag_string = "".join(get_tag_string(tags))
    p = re.compile(reg_exp)
    res = []
    retrieved_phrases = p.finditer(tag_string)
    for m in retrieved_phrases:
        np_lst = [
            tok for (tok, tag) in tags[m.start():m.start() + len(m.group())]
        ]
        tag_lst = [
            interpret_tag(tag)
            for (tok, tag) in tags[m.start():m.start() + len(m.group())]
        ]
        res.append(" ".join(np_lst))
        if "P" in tag_lst:
            idx = tag_lst.index("P")
            res.append(" ".join(np_lst[:idx]))
            res.append(" ".join(np_lst[idx + 1:]))
    return res
Example #13
    def __init__(self):

        self.tagger = PerceptronTagger()
        self.alarm_ai = AlarmResponse()
        self.converter_ai = ConversionResponse()
        self.summarize_ai = SummarizerResponse()
        self.news_ai = NewsResponse()
        self.maps_ai = MapsResponse()
        self.lemmatizer = WordNetLemmatizer()
        self.parser = spy.English()
        self.conversion = ConversionService()
        self.ai = {
            "alarm_api": self.alarm_ai,
            "unit_conversion": self.converter_ai,
            "summarization": self.summarize_ai,
            "news": self.news_ai,
            "maps_api": self.maps_ai
        }
        self.classifiers = []
        with open('nb_dumped_classifier.pkl', 'rb') as fid:
            self.classifiers.append(cPickle.load(fid))
        with open('sgd_dumped_classifier.pkl', 'rb') as fid:
            self.classifiers.append(cPickle.load(fid))
        with open('pla_dumped_classifier.pkl', 'rb') as fid:
            self.classifiers.append(cPickle.load(fid))
        self.previous_ents = {"PERSON": "", "GPE": ""}
        self.special_tags = {
            'PERSON': ['his', 'her', 'him', 'he', 'she'],
            'GPE': ['it', 'there']
        }
Example #14
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 never_split=None,
                 additional_special_tokens=[
                     "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]",
                     "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
                 ],
                 **kwargs):
        self.inflection_tokens = additional_special_tokens
        self.tagger = PerceptronTagger()
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         never_split=never_split,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        self.have_inflections = {'NOUN', 'ADJ', 'VERB'}
        self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
        self.do_lower_case = do_lower_case
        if do_lower_case:
            self.cased_tokenizer = BasicTokenizer(do_lower_case=False,
                                                  never_split=never_split)
        else:
            self.cased_tokenizer = self.basic_tokenizer
Example #15
    def __init__(self):
        import nltk
        nltk.download('averaged_perceptron_tagger')

        from nltk.tag.perceptron import PerceptronTagger

        self.inst = PerceptronTagger()
Example #16
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]

    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = PerceptronTagger()
    return _pos_tag(tokens, tagset, tagger)
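As the docstring notes, pos_tag() instantiates a fresh PerceptronTagger on every call, so batches of sentences are better tagged with pos_tag_sents(); a short usage sketch:

from nltk.tag import pos_tag_sents
from nltk.tokenize import word_tokenize

sentences = [
    "John's big idea isn't all that bad.",
    "The model is only loaded once for the whole batch.",
]
for tagged in pos_tag_sents([word_tokenize(s) for s in sentences]):
    print(tagged)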
Example #17
    def __init__(self, extended=False):
        self.__chunk_patterns = r""" #  helps us find noun phrase chunks
                NP: {<DT>?<JJ.*>*<NN.*>+}
                    {<NN.*>+}
                """
        # create a chunk parser
        self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns)

        # now define the Hearst patterns
        # format is <hearst-pattern>, <hypernym_location>
        # so, what this means is that if you apply the first pattern,
        self.__hearst_patterns = [
                ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("(NP_such_\w+ as (NP_\w+ ? (, )?(and |or )?)+\w+)", "first"),
                ("((NP_\w+ (, )?)+( )?(and |or )?NP_other_\w+)", "last"),
                ("(NP_\w+ (, )?including (NP_\w+ (, )?(and |or )?)+\w+)", "first"),
                ("(NP_\w+ (, )?especially (NP_\w+ (, )?(and |or )?)+\w+)", "first")

                # ''' IMPLEMENT ADDITIONAL HEARST PATTERNS HERE '''
            ]

        if extended:
            self.__hearst_patterns.extend([
                ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                #''' IMPLEMENT ADDITIONAL PATTERNS HERE '''
            ])

        self.__pos_tagger = PerceptronTagger()
Example #18
def tag_text_en(tokens, tokens_span):
    """Receive tokens and spans and return tuple list with tagged tokens"""
    tagger = PerceptronTagger()
    tags = []
    for i, tagged in enumerate(tagger.tag(tokens)):
        tags.append(tagged + (tokens_span[i], []))
    return tags
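A hedged usage sketch: here the (tokens, spans) pair comes from NLTK's span_tokenize(), although the surrounding project may obtain character offsets differently.

from nltk.tokenize import TreebankWordTokenizer

text = "Keyphrase extraction works on plain text."
spans = list(TreebankWordTokenizer().span_tokenize(text))
tokens = [text[start:end] for start, end in spans]

# Each entry comes back as (token, POS tag, (start, end), []).
for entry in tag_text_en(tokens, spans):
    print(entry)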
Example #19
    def __init__(self, hparams, dataset_type="relocar"):
        self.hparams = hparams
        self.dataset_type = dataset_type

        self._get_labels()
        self._get_word_dict()
        self._pad_idx = self.hparams.pad_idx

        # POS Tagger
        perceptron_tagger = PerceptronTagger()
        self.pos2id = {"<PAD>": 0}
        for pos_tag in list(perceptron_tagger.classes):
            self.pos2id[pos_tag] = len(self.pos2id)
        # Stanford NER
        self.ne2id = {
            "<PAD>": 0,
            "O": 1,
            "LOCATION": 2,
            "ORGANIZATION": 3,
            "PERSON": 4
        }

        if self.hparams.do_bert and not self.hparams.do_roberta and not self.hparams.do_xlnet:
            self._bert_tokenizer_init()
        elif self.hparams.do_bert and self.hparams.do_roberta:
            self._roberta_tokenizer_init()
        elif self.hparams.do_bert and self.hparams.do_xlnet:
            self._xlnet_tokenizer_init()
Example #20
def main():
    input_fname = 'small'
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    tknzr = TweetTokenizer()
    tagger = PerceptronTagger()

    fout = (
        'embeddings/smiley_tweets_embedding_expanded_{}'.format(input_fname))
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_{}'.format(input_fname), ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    tagdict = tagger.tagdict
    tagidx = {}
    nRows = len(word2vec)
    nCols = len(tagdict)

    print nRows, ':', nCols

    counter = 0
    for tag in tagdict.keys():
        tagidx[tag] = counter
        counter += 1

    exp_wemb = {}
    for word in word2vec.keys():
        exp_wemb[word] = np.zeros(nCols)

    print tagidx

    train = "semeval/task-B-train-plus-dev.tsv.gz"
    test = "semeval/task-B-test2014-twitter.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    test15 = "semeval/task-B-test2015-twitter.tsv.gz"
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    it = 0
    files = [train, test, dev, test15, smiley_pos]
    for filen in files:
        for tweet in gzip.open(filen, 'rb'):
            tweet = tknzr.tokenize(tweet.decode('utf-8'))
            tags = _pos_tag(tweet, None, tagger)
            for (word, tag) in tags:
                if word in exp_wemb.keys() and tag in tagidx.keys():
                    idx = tagidx[tag]
                    exp_wemb[word][idx] = 1
            if (it % 10) == 0:
                print 'Progress:', it
            it += 1

    f = open(fout, 'wb')
    for word in exp_wemb:
        f.write(word)
        tags = exp_wemb[word]
        for i in np.nditer(tags):
            f.write(' {}'.format(i))
        f.write("\n")
Example #21
def postag(txt):
    precptag = PerceptronTagger()
    input_list = txt.split()
    try:
        tagged = nltk.tag._pos_tag(tokens=input_list, tagset=None, tagger=precptag, lang='eng')
    except TypeError:
        tagged = nltk.tag._pos_tag(tokens=input_list, tagset=None, tagger=precptag)
    return tagged
Example #22
 def _get_tagger():
     # TODO: Instead of manually downloading the dutch_tagger, download it from an external source if it isn't installed at Data/
     try:
         os.chdir(r"Data")
         tagger = PerceptronTagger(load=False)
         tagger.load('model.perc.dutch_tagger_small.pickle')
         return tagger
     except (IndexError, FileNotFoundError):
         return None
Example #23
def nltk_perceptron_pos_tagger(input_dict):
    name= 'PerceptronPosTagger'
    if not input_dict['training_corpus']:
        perceptron_tagger = PerceptronTagger()
        name += '-pretrained'
    else: 
        perceptron_tagger = PerceptronTagger(load=False)
        chunk = input_dict['training_corpus']['chunk']
        corpus = input_dict['training_corpus']['corpus']
        training_corpus=corpus_reader(corpus, chunk)
        perceptron_tagger.train(list(training_corpus))

    return {'pos_tagger': {
                'function':'tag_sents',
                'object': perceptron_tagger,
                'name': name
            }
    }
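A hedged illustration of how the returned widget dictionary might be consumed downstream: look up the method named under 'function' on the stored tagger object and apply it to tokenized sentences (passing an empty 'training_corpus' selects the pretrained model).

result = nltk_perceptron_pos_tagger({'training_corpus': None})
pos_tagger = result['pos_tagger']
tag_sents = getattr(pos_tagger['object'], pos_tagger['function'])

print(pos_tagger['name'])  # 'PerceptronPosTagger-pretrained'
print(tag_sents([['This', 'is', 'a', 'short', 'test', '.']]))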
Example #24
def filter_input(txt):
    precptag = PerceptronTagger()
    input_list = txt.lower().split()
    temp_list = nltk.tag._pos_tag(input_list, None, tagger=precptag)
    return_list = []
    for temp in temp_list:
        if temp[1] not in ['CC','WDT','WP','WRB']: 
            return_list.append(temp[0])
    #return_list=remove_stopwords(return_list)
    return return_list
Example #25
 def test_perceptron_tagging(self):
     sentence = "This is a test sentence to test if the testing works."
     tokens = word_tokenize(sentence)
     pt = PerceptronTagger(load=True)
     tag_result1 = [x[1] for x in pt.tag(tokens)]
     pt2 = perctagger()
     pt2.load()
     tag_result2 = pt2.tag(tokens)
     self.assertListEqual(tag_result1, tag_result2)
Example #26
def pos_sequence_from(keyphrase, tags):
    """Receive keyphrase dict and return list of tags"""
    pos_sequence = list(map(lambda i: tags[i][1], keyphrase["tokens-indices"]))
    # Special case when tokenization doesn't match the annotation
    if pos_sequence == []:
        tagger = PerceptronTagger()
        keyphrase_tags = tagger.tag(keyphrase['keyphrase-text'].split())
        pos_sequence = list(map(lambda t: t[1], keyphrase_tags))
    return pos_sequence
Example #27
    def __init__(self, df_train, df_valid, df_test, args, unk=True):

        self.dictionary = Dictionary(max_vocab_size=args.max_vocab_size)
        self.unk = unk
        self.classification = args.classification
        self.transfer_learning = args.transfer_learning
        self.max_length = args.n_ctx
        self.pos = args.POS_tags
        self.pos_tags = set()
        self.bpe = args.bpe
        self.lang = args.lang
        if self.bpe:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(args.bpe_model_path)

        if self.pos:
            self.tagger = PerceptronTagger()

        if self.transfer_learning:

            with open(args.dict_path, 'rb') as file:
                self.dictionary = pickle.load(file)
        else:
            self.tokenize_df(df_train)
            self.tokenize_df(df_valid)
            self.tokenize_df(df_test)
            self.dictionary.sort_words(unk=self.unk, pos_tags=self.pos_tags)

            if not self.transfer_learning:

                with open(args.dict_path, 'wb') as file:
                    pickle.dump(self.dictionary, file)

        if not self.classification:
            if self.pos:
                self.train, self.train_pos = self.tokenize_(df_train)
                self.valid, self.valid_pos = self.tokenize_(df_valid)
                self.test, self.test_pos = self.tokenize_(df_test)
            else:
                self.train = self.tokenize_(df_train)
                self.valid = self.tokenize_(df_valid)
                self.test = self.tokenize_(df_test)
        else:
            if self.pos:
                self.train, self.train_pos, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                    df_train, max_length=self.max_length)
                self.valid, self.valid_pos, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                    df_valid, max_length=self.max_length, valid=False)
                self.test, self.test_pos, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                    df_test, max_length=self.max_length)
            else:
                self.train, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                    df_train, max_length=self.max_length)
                self.valid, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                    df_valid, max_length=self.max_length, valid=False)
                self.test, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                    df_test, max_length=self.max_length)
Example #28
 def __init__(self, model_type='english-bidirectional-distsim.tagger'):
     """
     Args:
         model_type: model available in $STANFORD_MODELS:
             english-bidirectional-distsim.tagger
             english-caseless-left3words-distsim.tagger
             english-left3words-distsim.tagger
     """
     #self.eng_tagger = StanfordPOSTagger(model_type, java_options='-mx16000m')
     self.eng_tagger = PerceptronTagger()
Example #29
def pos_tokenizer(text):
    word_tokens = tokenize(text)
    # using pretrained model to tag all tokens
    pretrained_tagger = PerceptronTagger(load=True)
    results = pretrained_tagger.tag(word_tokens)
    # collecting pos from resulting tuples
    pos_tokens = []
    for word_pos in results:
        pos_tokens.append(word_pos[1])
    return pos_tokens
Example #30
 def __init__(self):
     # Prepare levels
     self.levels["USAGE"] = 1
     self.levels["RESULT"] = 2
     self.levels["MODEL-FEATURE"] = 3
     self.levels["PART_WHOLE"] = 4
     self.levels["TOPIC"] = 5
     self.levels["COMPARE"] = 6
     self.pos_tagger = PerceptronTagger()
     self.stopwords = nltk.corpus.stopwords.words('english')