def get_instances(self, label_file, xml_file):
     instances = []
     labels_final = set()
     tagger = PerceptronTagger(
     )  # load nltk perceptron just once to speed up tagging
     labels_dict = {
         0: "anger",
         1: "disgust",
         2: "fear",
         3: "joy",
         4: "sadness",
         5: "surprise"
     }
     tree = ET.parse(xml_file)
     root = tree.getroot()
     with open(label_file) as f:
         for sent, line in izip(root, f):
             id_xml = sent.attrib.values()[0]
             id_labels = line.rstrip().split()
             id_file = id_labels[0]
             if id_xml == id_file:
                 for i in sent.itertext():
                     text = i
                 labels = id_labels[1:]
                 label = labels.index(
                     str(max([int(label) for label in labels])))
                 inst = Instance(text, labels_dict[label])
                 inst_tokenized = word_tokenize(text)
                 inst_tagged = tagger.tag(inst_tokenized)
                 for tokentag in inst_tagged:
                     token = Token(tokentag[0], tokentag[1])
                     inst.add_token(token)
                 instances.append(inst)
                 labels_final.add(label)
         return instances, labels_final
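A minimal sketch of the load-once pattern the comment above relies on: constructing PerceptronTagger a single time and reusing it avoids re-reading the pickled model for every sentence, which repeated calls to nltk.pos_tag can do in some NLTK versions. The sentences are illustrative and assume the usual punkt and averaged_perceptron_tagger data are installed.

from nltk import word_tokenize
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()  # loads the averaged perceptron model once

for sent in ["I love this movie.", "That was terrifying."]:
    tokens = word_tokenize(sent)
    print(tagger.tag(tokens))  # reuses the already-loaded model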
Example #2
    def __init__(self, extended=False):
        self.__chunk_patterns = r""" #  helps us find noun phrase chunks
                NP: {<DT>?<JJ.*>*<NN.*>+}
                    {<NN.*>+}
                """
        # create a chunk parser
        self.__np_chunker = nltk.RegexpParser(self.__chunk_patterns)

        # now define the Hearst patterns
        # format is <hearst-pattern>, <hypernym_location>
        # "first" means the hypernym is the first NP in the matched pattern,
        # and "last" means it is the last NP in the matched pattern
        self.__hearst_patterns = [
                ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                ("(NP_such_\w+ as (NP_\w+ ? (, )?(and |or )?)+\w+)", "first"),
                ("((NP_\w+ (, )?)+( )?(and |or )?NP_other_\w+)", "last"),
                ("(NP_\w+ (, )?including (NP_\w+ (, )?(and |or )?)+\w+)", "first"),
                ("(NP_\w+ (, )?especially (NP_\w+ (, )?(and |or )?)+\w+)", "first")

                # ''' IMPLEMENT ADDITIONAL HEARST PATTERNS HERE '''
            ]

        if extended:
            self.__hearst_patterns.extend([
                ("(NP_\w+ (, )?such as (NP_\w+ ? (, )?(and |or )?)+)", "first"),
                #''' IMPLEMENT ADDITIONAL PATTERNS HERE '''
            ])

        self.__pos_tagger = PerceptronTagger()
def get_all_terms_in_sent(reg_exp, sent):
    tokens = nltk.word_tokenize(sent)
    # tags = nltk.pos_tag(tokens)
    pretrain = PerceptronTagger()
    tags = pretrain.tag(tokens)
    tags = [[tag[0], tag[1]] for tag in tags]
    if (not (tags[0][1].startswith("NNP"))):
        tags[0][0] = tags[0][0].lower()
    tag_string = "".join(get_tag_string(tags))
    p = re.compile(reg_exp)
    res = []
    retrieved_phrases = p.finditer(tag_string)
    for m in retrieved_phrases:
        np_lst = [
            tok for (tok, tag) in tags[m.start():m.start() + len(m.group())]
        ]
        tag_lst = [
            interpret_tag(tag)
            for (tok, tag) in tags[m.start():m.start() + len(m.group())]
        ]
        res.append(" ".join(np_lst))
        if "P" in tag_lst:
            idx = tag_lst.index("P")
            res.append(" ".join(np_lst[:idx]))
            res.append(" ".join(np_lst[idx + 1:]))
    return res
def pos_titles_from(input_path, output_path = None, options = None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
def tag_text_en(tokens, tokens_span):
    """Receive tokens and spans and return tuple list with tagged tokens"""
    tagger = PerceptronTagger()
    tags = []
    for i, tagged in enumerate(tagger.tag(tokens)):
        tags.append(tagged + (tokens_span[i], []))
    return tags
Example #6
    def __init__(self):
        import nltk
        nltk.download('averaged_perceptron_tagger')

        from nltk.tag.perceptron import PerceptronTagger

        self.inst = PerceptronTagger()
Example #7
    def __init__(self, stop_words=None, sent_detector=None, documents=None):
        # Load a persisted list of stopwords
        # unless something else is specified
        if not stop_words:
            self._stop_words = stopwords.words('english')
            self._stop_words += ['minister', 'question', 'member', "member’s"]
        else:
            self._stop_words = stop_words
        # Load these to save time reading from
        # disk later
        if not sent_detector:
            self._sent_detector = nltk.data.load(
                'tokenizers/punkt/english.pickle')
        else:
            self._sent_detector = sent_detector
        self._tokenizor = TreebankWordTokenizer()
        self._lemmatizer = WordNetLemmatizer()
        self._tagger = PerceptronTagger()

        if documents:
            # Create a list of lists of tokens all lowercased
            # , lemmatized and filtered
            self.documents = self._make_tokens(documents)
            self.dictionary = corpora.Dictionary(self.documents)
            self.corpus = self._bag_o_words(self.documents)
            self._tf_idf_model = models.TfidfModel(self.corpus)
            self.transformed_corpus = self._tf_idf_model[self.corpus]
        else:
            self.documents = None
Example #8
def pos_tagger(infile, outfile):
    start = time.time()
    tagged_ngrams_counter = Counter()
    tagger = PerceptronTagger()
    with io.open(infile, encoding='utf-8', mode='rt') as text_file:
        for i, line in enumerate(text_file):
            if i % 100000 == 0:
                print(
                    f'{os.getpid()} process, {i} lines, {time.time()-start:.1f} time'
                )
            if n == 1:
                tagged_ngrams = tagger.tag(line.rstrip().split(' '))
            else:
                tagged_ngrams = [
                    tuple(tagger.tag(ngram))
                    for ngram in get_ngrams(line.rstrip().split(' '), n)
                ]
            tagged_ngrams_counter.update(tagged_ngrams)
        with open(outfile, mode='wb') as counter_pickle, \
            open(outfile[:-6]+'csv', 'w', encoding='utf-8') as counter_csv:
            pickle.dump(tagged_ngrams_counter, counter_pickle)
            counter_csv.write(csv_headers[n])
            if n == 1:
                for tagged_ngram, count in tagged_ngrams_counter.items():
                    counter_csv.write('{} {} {}\n'.format(
                        *tagged_ngram, count))
            else:
                for tagged_ngram, count in tagged_ngrams_counter.items():
                    ngram, tags = zip(*tagged_ngram)
                    counter_csv.write('{} {} {}\n'.format(
                        ' '.join(ngram), ' '.join(tags), count))
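pos_tagger above relies on three module-level names that the snippet does not show: the ngram order n, the csv_headers mapping, and a get_ngrams helper. The definitions below are assumptions sketched only to make the example self-contained; they are not taken from the original project.

n = 2  # assumed ngram order consulted by pos_tagger

# assumed CSV header line per ngram order
csv_headers = {
    1: 'token tag count\n',
    2: 'ngram tags count\n',
}

def get_ngrams(tokens, n):
    # yield consecutive n-grams as tuples of tokens
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i:i + n])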
Example #9
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 never_split=None,
                 additional_special_tokens=[
                     "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]",
                     "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
                 ],
                 **kwargs):
        self.inflection_tokens = additional_special_tokens
        self.tagger = PerceptronTagger()
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         never_split=never_split,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        self.have_inflections = {'NOUN', 'ADJ', 'VERB'}
        self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
        self.do_lower_case = do_lower_case
        if do_lower_case:
            self.cased_tokenizer = BasicTokenizer(do_lower_case=False,
                                                  never_split=never_split)
        else:
            self.cased_tokenizer = self.basic_tokenizer
Example #10
 def get_instances(self, folder):
     # happiness vs. joy? "hp" is mapped onto the joy label here
     labels_dict = {
         "hp": "joy",
         "sd": "sadness",
         "ag": "anger",
         "dg": "disgust",
         "sp": "surprise",
         "fr": "fear"
     }
     instances = []
     labels = set()
     tagger = PerceptronTagger(
     )  # load nltk perceptron just once to speed up tagging
     with open(folder) as f:
         for line in f:
             label, id, text = line.strip().split(
                 " ", 2)  # split by first two spaces only
             if label == "ne":  # ignore no emotion
                 continue
             inst = Instance(text, labels_dict[label])
             inst_tokenized = word_tokenize(text)
             inst_tagged = tagger.tag(inst_tokenized)
             for tokentag in inst_tagged:
                 token = Token(tokentag[0], tokentag[1])
                 inst.add_token(token)
             instances.append(inst)
             labels.add(label)
     return instances, labels
 def __init__(self):
   self.hmm_models = []
   self.n_hmm = 0
   self.hmm2idx = {}
   self.idx2hmm = {}
   self.tagger = PerceptronTagger()
   return
Example #12
def Top_K_verbs(k, df, text_col, class_col, plot=False):
    vect = TfidfVectorizer()
    vect.fit(df[text_col])
    tfidf_df = pd.DataFrame(vect.transform(df[text_col]).toarray())
    tfidf_df.columns = vect.get_feature_names()
    tfidf_T = tfidf_df.transpose()
    tagger = PerceptronTagger()
    tfidf_T['pos'] = tagger.tag(tfidf_T.index)
    tfidf_T = tfidf_T[tfidf_T['pos'].apply(
        lambda tup: tup[1] in ['VB', 'VBD', 'VBG', 'VBN'])]
    tfidf_df = tfidf_T.drop(['pos'], axis=1).transpose()
    top_k_by_class = dict()
    for v in df[class_col].value_counts().index:
        freq_in_class = tfidf_df[df[class_col] == v].sum(axis=0).sort_values(
            ascending=False)
        frac_in_class = freq_in_class / freq_in_class.sum()
        top_k_by_class[v] = frac_in_class[:k].index

        if plot:
            print('the top {} frequent verbs for class {}:'.format(k, v))
            plt.figure(figsize=(5, 10))
            sns.barplot(y=frac_in_class[:k].index, x=frac_in_class[:k])
            plt.xlabel('fraction')
            plt.show()

    return (top_k_by_class)
def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
def pos_sequence_from(keyphrase, tags):
    """Receive keyphrase dict and return list of tags"""
    pos_sequence = list(map(lambda i: tags[i][1], keyphrase["tokens-indices"]))
    # Special case when tokenization don't match with annotation
    if pos_sequence == []:
        tagger = PerceptronTagger()
        keyphrase_tags = tagger.tag(keyphrase['keyphrase-text'].split())
        pos_sequence = list(map(lambda t: t[1], keyphrase_tags))
    return pos_sequence
Example #15
 def _get_tagger():
     # TODO: Instead of manually downloading the dutch_tagger, download it from an external source if it isn't installed at Data/
     try:
         os.chdir(r"Data")
         tagger = PerceptronTagger(load=False)
         tagger.load('model.perc.dutch_tagger_small.pickle')
         return tagger
     except (IndexError, FileNotFoundError):
         return None
    def __init__(self, df_train, df_valid, df_test, args, unk=True):

        self.dictionary = Dictionary(max_vocab_size=args.max_vocab_size)
        self.unk = unk
        self.classification = args.classification
        self.transfer_learning = args.transfer_learning
        self.max_length = args.n_ctx
        self.pos = args.POS_tags
        self.pos_tags = set()
        self.bpe = args.bpe
        self.lang = args.lang
        if self.bpe:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(args.bpe_model_path)

        if self.pos:
            self.tagger = PerceptronTagger()

        if self.transfer_learning:

            with open(args.dict_path, 'rb') as file:
                self.dictionary = pickle.load(file)
        else:
            self.tokenize_df(df_train)
            self.tokenize_df(df_valid)
            self.tokenize_df(df_test)
            self.dictionary.sort_words(unk=self.unk, pos_tags=self.pos_tags)

            if not self.transfer_learning:

                with open(args.dict_path, 'wb') as file:
                    pickle.dump(self.dictionary, file)

        if not self.classification:
            if self.pos:
                self.train, self.train_pos = self.tokenize_(df_train)
                self.valid, self.valid_pos = self.tokenize_(df_valid)
                self.test, self.test_pos = self.tokenize_(df_test)
            else:
                self.train = self.tokenize_(df_train)
                self.valid = self.tokenize_(df_valid)
                self.test = self.tokenize_(df_test)
        else:
            if self.pos:
                self.train, self.train_pos, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                    df_train, max_length=self.max_length)
                self.valid, self.valid_pos, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                    df_valid, max_length=self.max_length, valid=False)
                self.test, self.test_pos, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                    df_test, max_length=self.max_length)
            else:
                self.train, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                    df_train, max_length=self.max_length)
                self.valid, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                    df_valid, max_length=self.max_length, valid=False)
                self.test, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                    df_test, max_length=self.max_length)
Example #17
 def test_perceptron_tagging(self):
     sentence = "This is a test sentence to test if the testing works."
     tokens = word_tokenize(sentence)
     pt = PerceptronTagger(load=True)
     tag_result1 = [x[1] for x in pt.tag(tokens)]
     pt2 = perctagger()
     pt2.load()
     tag_result2 = pt2.tag(tokens)
     self.assertListEqual(tag_result1, tag_result2)
Example #18
class Tagger(AbstractTagger):
    def __init__(self):
        self._tagger = PerceptronTagger(load=False)
        self._name = 'nltkperceptron'
        self._model_name = "nltkperceptron"
        self._result = None
        super().__init__()

    def _save_model(self, fpath):
        with open(fpath, 'wb') as f:
            dill.dump(self._tagger, f)

    def load(self, path=''):
        if path == '':
            self._load_model(path)
        else:
            mpath = os.path.join(path, self.model_name)
            self._load_model(mpath)

    def _load_model(self, fpath):
        if fpath == '':
            self._tagger = PerceptronTagger(load=True)
        else:
            with open(fpath, 'rb') as f:
                self._tagger = dill.load(f)

    def tag(self, data):
        res = self._tagger.tag(data)
        return [x[1] for x in res]

    def train(self, data):
        # Reset tagger.
        self._tagger = PerceptronTagger(load=False)
        self._tagger.train(data)

    @property
    def produces_temp_data(self):
        return False

    @property
    def requires_additional_params(self):
        return False

    def set_additional_params(self, options):
        pass

    def add_temp_dir(self, options):
        pass

    @property
    def model_name(self):
        return self._model_name

    @property
    def name(self):
        return self._name
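A short usage sketch for the wrapper above; the two training sentences are toy data in the (token, tag) format PerceptronTagger.train expects.

train_data = [
    [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('.', '.')],
    [('Dogs', 'NNS'), ('bark', 'VBP'), ('.', '.')],
]

t = Tagger()
t.train(train_data)                       # trains a fresh PerceptronTagger
print(t.tag(['The', 'dog', 'sat', '.']))  # returns predicted tags only
t.load('')                                # falls back to NLTK's pretrained model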
Example #19
def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == "eng":
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger
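A hedged usage sketch for _get_tagger: find and RUS_PICKLE are assumed to come from nltk.data and nltk.tag, and the Russian branch only works if the Russian averaged-perceptron resource has been downloaded.

from nltk import word_tokenize

eng_tagger = _get_tagger("eng")
print(eng_tagger.tag(word_tokenize("The quick brown fox jumps over the lazy dog.")))

# Russian branch, assuming nltk.download('averaged_perceptron_tagger_ru') was run:
# rus_tagger = _get_tagger("rus")
# print(rus_tagger.tag("Илья оторопел и дважды перечитал бумажку .".split()))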
def pos_tokenizer(text):
    word_tokens = tokenize(text)
    # using pretrained model to tag all tokens
    pretrained_tagger = PerceptronTagger(load=True)
    results = pretrained_tagger.tag(word_tokens)
    # collecting pos from resulting tuples
    pos_tokens = []
    for word_pos in results:
        pos_tokens.append(word_pos[1])
    return pos_tokens
Example #21
def ap(train_path, test_path):
    modelref = 'ap-' + md5(('ap///' + train_path).encode()).hexdigest() + '.pickle'

    test_sentences = list(gen_corpus(test_path))

    if not isfile(modelref):
        start = perf_counter()
        training_sentences = list(gen_corpus(train_path))

        ap_model = PerceptronTagger(load=False)
        ap_model.train(list(convert_sents_to_zipped(training_sentences)), save_loc=modelref)
        end = perf_counter()
        print('Training took {} ms.'.format(int((end - start) * 1000)))
    else:
        ap_model = PerceptronTagger(load=False)
        ap_model.load(modelref)
        print('Model loaded from file.')

    # Evaluation
    start = perf_counter()
    y_pred, y_true = [], []
    for words, tags in test_sentences:
        y_pred.extend(y for x, y in ap_model.tag(words))
        y_true.extend(tags)

    end = perf_counter()
    print('Testing took {} ms.'.format(int((end - start) * 1000)))

    for l in classification_report(y_true, y_pred).split('\n'):
        print(l)
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token)  for token in tagger.tag(tokens)])
            print " ".join([token[1]  for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
Example #23
def ie_preprocess(document):

    tagger = PerceptronTagger()
    tagged = []
    sentences = document.split("\n")
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    for sent in sentences:
        tagged_tokens = tagger.tag(sent)
        tagged.append(tagged_tokens)

    return tagged
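A small usage example for ie_preprocess; the document string is illustrative.

document = "John lives in Berlin.\nHe works for a small startup."
for sent_tags in ie_preprocess(document):
    print(sent_tags)
# e.g. [('John', 'NNP'), ('lives', 'VBZ'), ('in', 'IN'), ('Berlin', 'NNP'), ('.', '.')]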
Example #24
 def __init__(self, train=False):
     self.tagger = PerceptronTagger()
     self.model = None
     # BOW: triangle, rectangle, circle, hand
     # verbs: draw, wave, rotate
     self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle', 'circle', 'hand']
     self.VERBS = [wn.synset('draw.v.01'), wn.synset('wave.v.01'), wn.synset('rotate.v.01')]
     self.n_bow, self.n_verbs = len(self.BOW), len(self.VERBS)
     if train: self.train_svm()
     else: self.load_model()
     return
Example #25
 def __init__(self, path='../data/', win_size=4):
     self.path = path
     self.win_size = win_size
     self.create_synsets_dictionaries()
     self.create_lemma_unique_synset_dictionary()
     self.synsetid_synsetname_d = None
     self.synsetname_synsetid_d = None
     self.lemma_synsetid_d = None
     self.synsetid_lemma_d = None
     self.word_lemma_d = None
     self.tagger = PerceptronTagger()
     self.lemmatizer = WordNetLemmatizer()
Example #26
 def __init__(self, stopwords=[], term_patterns=[], min_term_length=3, min_term_words=2):
     #StopWordsDetector
     self.stopwords = set(stopwords)
     self.min_term_length = min_term_length
     self.term_patterns = term_patterns
     self.min_term_words = min_term_words
     self.detectors = []
     self.pos_tagger=PerceptronTagger()
     for tp in term_patterns:
         self.detectors.append(POSSequenceDetector(tp))
         
     self.swd = StopWordsDetector(self.stopwords)
Example #27
 def __init__(self, pretokenizer='moses'):
     self.tagger = PerceptronTagger()
     self.pretok_type = pretokenizer
     if pretokenizer == 'bertpretokenizer':
         self.pretokenizer = BertPreTokenizer()
     elif pretokenizer == 'moses':
         self.pretokenizer = MosesTokenizer()
         self.detokenizer = MosesDetokenizer()
     elif pretokenizer == 'whitespace':
         pass
     else:
         raise ValueError(
             "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
         )
Example #28
    def __init__(self, df_train, df_valid, df_test, args, unk=True):

        self.dictionary = Dictionary(max_vocab_size=args.max_vocab_size)
        self.unk = unk
        self.classification = args.classification
        self.transfer_learning = args.transfer_learning
        self.max_length = args.n_ctx
        self.pos = args.POS_tags
        self.pos_tags = set()
        self.bpe = args.bpe
        self.lang = args.lang
        if self.bpe:
            self.sp = GPT2Tokenizer.from_pretrained("gpt2",
                                                    add_prefix_space=True)

        if self.pos:
            self.tagger = PerceptronTagger()

        self.tokenize_df(df_train)
        self.tokenize_df(df_valid)
        self.tokenize_df(df_test)

        with open(args.dict_path, 'wb') as file:
            pickle.dump(self.dictionary, file)

        if not self.classification:
            if self.pos:
                self.train, self.train_pos = self.tokenize_(df_train)
                self.valid, self.valid_pos = self.tokenize_(df_valid)
                self.test, self.test_pos = self.tokenize_(df_test)
            else:
                self.train = self.tokenize_(df_train)
                self.valid = self.tokenize_(df_valid)
                self.test = self.tokenize_(df_test)
        else:
            if self.pos:
                self.train, self.train_pos, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                    df_train, max_length=self.max_length)
                self.valid, self.valid_pos, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                    df_valid, max_length=self.max_length, valid=False)
                self.test, self.test_pos, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                    df_test, max_length=self.max_length)
            else:
                self.train, self.train_target, self.train_keywords, self.train_stemmed_string = self.tokenize_doc(
                    df_train, max_length=self.max_length)
                self.valid, self.valid_target, self.valid_keywords, self.valid_stemmed_string = self.tokenize_doc(
                    df_valid, max_length=self.max_length, valid=False)
                self.test, self.test_target, self.test_keywords, self.test_stemmed_string = self.tokenize_doc(
                    df_test, max_length=self.max_length)
Example #29
    def _perform_analysis(self, tokenized_sents):
        res = []

        if len(self.precalced_data):
            return self.precalced_data
        else:
            for tokens in tokenized_sents:
                tagger = PerceptronTagger()
                tags = tagger.tag(tokens)

                res += tags

        self.precalced_data = res

        return res
Example #30
    def __init__(self, hparams, dataset_type="relocar"):
        self.hparams = hparams
        self.dataset_type = dataset_type

        self._get_labels()
        self._get_word_dict()
        self._pad_idx = self.hparams.pad_idx

        # POS Tagger
        perceptron_tagger = PerceptronTagger()
        self.pos2id = {"<PAD>": 0}
        for pos_tag in list(perceptron_tagger.classes):
            self.pos2id[pos_tag] = len(self.pos2id)
        # Stanford NER
        self.ne2id = {
            "<PAD>": 0,
            "O": 1,
            "LOCATION": 2,
            "ORGANIZATION": 3,
            "PERSON": 4
        }

        if self.hparams.do_bert and not self.hparams.do_roberta and not self.hparams.do_xlnet:
            self._bert_tokenizer_init()
        elif self.hparams.do_bert and self.hparams.do_roberta:
            self._roberta_tokenizer_init()
        elif self.hparams.do_bert and self.hparams.do_xlnet:
            self._xlnet_tokenizer_init()
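The pos2id vocabulary above is built from PerceptronTagger.classes, the set of Penn Treebank tags the pretrained model can emit. A minimal sketch for inspecting that set:

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()
print(len(tagger.classes))         # number of distinct tags in the model
print(sorted(tagger.classes)[:5])  # e.g. ['#', '$', "''", '(', ')']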
Example #31
    def __init__(self):

        self.tagger = PerceptronTagger()
        self.alarm_ai = AlarmResponse()
        self.converter_ai = ConversionResponse()
        self.summarize_ai = SummarizerResponse()
        self.news_ai = NewsResponse()
        self.maps_ai = MapsResponse()
        self.lemmatizer = WordNetLemmatizer()
        self.parser = spy.English()
        self.conversion = ConversionService()
        self.ai = {
            "alarm_api": self.alarm_ai,
            "unit_conversion": self.converter_ai,
            "summarization": self.summarize_ai,
            "news": self.news_ai,
            "maps_api": self.maps_ai
        }
        self.classifiers = []
        with open('nb_dumped_classifier.pkl', 'rb') as fid:
            self.classifiers.append(cPickle.load(fid))
        with open('sgd_dumped_classifier.pkl', 'rb') as fid:
            self.classifiers.append(cPickle.load(fid))
        with open('pla_dumped_classifier.pkl', 'rb') as fid:
            self.classifiers.append(cPickle.load(fid))
        self.previous_ents = {"PERSON": "", "GPE": ""}
        self.special_tags = {
            'PERSON': ['his', 'her', 'him', 'he', 'she'],
            'GPE': ['it', 'there']
        }
Example #32
    def wordTagging(self, sentence):
        posWords = []
        # text = nltk.word_tokenize(sentence)
        tagset = None
        start_time = time.time()
        try:
            tokens = nltk.word_tokenize(sentence)
        except UnicodeDecodeError:
            tokens = nltk.word_tokenize(sentence.decode('utf-8'))

        tagger = PerceptronTagger()
        pos_arrs = nltk.tag._pos_tag(tokens, tagset, tagger)
        for pos_arr in pos_arrs:
            pos = []
            pos.append(pos_arr[0])
            pos.append(pos_arr[1])
            # print pos_arr
            if pos_arr[1] in self.posDictionary:
                ppt = self.posDictionary[pos_arr[1]]
            else:
                ppt = "symbol"

            pos.append(ppt)
            posWords.append(pos)

        return posWords
Example #33
def pos_tag(tokens, tagset=None):
    """
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.

        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]

    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.

    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be used, e.g. universal, wsj, brown
    :type tagset: str
    :return: The tagged tokens
    :rtype: list(tuple(str, str))
    """
    tagger = PerceptronTagger()
    return _pos_tag(tokens, tagset, tagger)
Example #34
    def __init__(self, url, testrun):
        """Initialize the ShallowPipeline.

        Args:
            url (String)       The Solr URL for the collection
            testrun (Boolean)  True if it is a test run, False if need
                               to index full corpus
        """
        self.solr = index.SolrSearch(url)
        self.testrun = testrun
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.tagger = PerceptronTagger()
        self.dep_parser = StanfordDependencyParser(
            model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
            java_options=u'-mx4g')
    def __init__(self, dbFilePath=None, revDbFilePath=None, pos=False):
        self.dbFilePath = dbFilePath
        self.revDbFilePath = revDbFilePath
        if pos:
            self.tagger = PerceptronTagger()
        self.pos = pos

        # try to open forward database
        if not dbFilePath:
            self.dbFilePath = os.path.join(os.path.dirname(__file__),
                                           "markovdb")
        try:
            with open(self.dbFilePath, 'rb') as dbfile:
                self.db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warn(
                'Database file corrupt or not found, using empty database')
            self.db = _db_factory()

        # try to open backwards database
        if not revDbFilePath:
            self.revDbFilePath = os.path.join(os.path.dirname(__file__),
                                              "revmarkovdb")
        try:
            with open(self.revDbFilePath, 'rb') as dbfile:
                self.rev_db = pickle.load(dbfile)
        except (IOError, ValueError):
            logging.warn(
                'Database file corrupt or not found, using empty database')
            self.rev_db = _db_factory()
class HmmSeqRecognizer(object):
  def __init__(self):
    self.hmm_models = []
    self.n_hmm = 0
    self.hmm2idx = {}
    self.idx2hmm = {}
    self.tagger = PerceptronTagger()
    return

  def batch_test(self, samples, label):
    tp,ns = 0,len(samples)
    for i in xrange(ns):
      idx = self.predict_sample(samples[i])
      if idx==label: tp+=1
    return tp,float(tp)/ns

  def predict_sample(self, sample):
    sample = [sample]
    probs = [ model.test(sample) for model in self.hmm_models ]
    return probs.index(max(probs))

  def predict_sentence(self, sentence):
    sample =  [[ tag for _,tag in self.tagger.tag(word_tokenize(sentence)) ]]
    probs = [ model.test(sample) for model in self.hmm_models ]
    return probs.index(max(probs))

  def add_model(self, name, model):
    self.hmm_models.append(model)
    self.hmm2idx[name] = self.n_hmm
    self.idx2hmm[self.n_hmm] = name
    self.n_hmm += 1

  def new_hmm(self, name, datapath, nhs, ne):
    print '=> adding HMM model \'%s\'...' % name
    hmm_model = HmmModel(nhs)
    hmm_model.train(datapath,ne)
    self.add_model(name, hmm_model)
    print '|  done'
    return

  def save_hmm(self, name, hmm_path):
    print '=> saving HMM model \'%s\'...' % name
    f = open(hmm_path, 'wb')
    pickle.dump(self.hmm_models[self.hmm2idx[name]], f)
    f.close()
    print '|  done'
    return

  def load_hmm(self, name, hmm_path):
    # print '=> adding HMM model \'%s\'...' % name
    f = open(hmm_path, 'rb')
    hmm_model = pickle.load(f)
    f.close()
    self.add_model(name, hmm_model)
    # print '|  done'
    return
def pos_title_abstract(resource):
    xml_file = resource[0]
    pos_data = []
    try:
        xmldoc = minidom.parse(xml_file)
        elements_title = xmldoc.getElementsByTagName("Title")
        title =  elements_title.item(0).childNodes[0].nodeValue
        elements_list = xmldoc.getElementsByTagName("S")
        sentences = [e[0].nodeValue for e in [element.childNodes for element in elements_list] if e[0].nodeType == e[0].TEXT_NODE]
        sentences.insert(0, title)
        #raw text 
        txt_file = change_file_extention(resource[1], "xml", "txt")
        save_to_file(txt_file, sentences)
        #pos
        tagger = PerceptronTagger()
        for s in sentences:
            tokens = word_tokenize(s)
            pos_data.append(tagger.tag(tokens))
        pos_file = change_file_extention(resource[1], "xml", "pos")
        save_to_file(pos_file, pos_data)
    except:
        print >> sys.stderr, "Error pos_title_abstract:", resource, sys.exc_info()

    return pos_data 
Example #38
class NERTagger:
    def __init__(self):
        self.pos_tagger = PerceptronTagger()

    def tag(self, tokens):
        tree = nltk.ne_chunk(self.pos_tagger.tag(tokens))
        tagged_tokens = []
        for t in tree:
            if type(t) == nltk.tree.Tree:
                label = t.label()
                for token in t:
                    tagged_tokens.append((token[0], label))
            else:
                tagged_tokens.append(t)
        return tagged_tokens
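A brief usage sketch for NERTagger, assuming NLTK's maxent_ne_chunker and words resources are installed; the sentence is illustrative.

from nltk import word_tokenize

ner = NERTagger()
print(ner.tag(word_tokenize("Barack Obama visited Paris last year.")))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'VBD'),
#       ('Paris', 'GPE'), ('last', 'JJ'), ('year', 'NN'), ('.', '.')]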
class ActionDetection(object):
    def __init__(self, train=False):
        self.tagger = PerceptronTagger()
        self.model = None
        # BOW: triangle, rectangle, circle, hand
        # verbs: draw, wave, rotate
        self.BOW = ['draw', 'wave', 'rotate', 'triangle', 'rectangle', 'circle', 'hand']
        self.VERBS = [wn.synset('draw.v.01'), wn.synset('wave.v.01'), wn.synset('rotate.v.01')]
        self.n_bow, self.n_verbs = len(self.BOW), len(self.VERBS)
        if train: self.train_svm()
        else: self.load_model()
        return

    def save_model(self):
        f = open(MODEL_PATH + 'action_detection.model', 'wb')
        pickle.dump(self.model, f)
        f.close()
        return

    def train_svm(self):
        with open(DATA_PATH+'action_detection_training_set.txt') as f:
            data = f.readlines()
        X, y = [],[]
        for line in data:
            line = line.strip()
            if not line: continue
            line = line.split(' ',1)
            X.append(self.extract_feature(line[1]))
            y.append(int(line[0]))
        lin_clf = svm.LinearSVC()
        lin_clf.fit(X, y)
        self.model = lin_clf
        self.save_model()
        return

    def load_model(self):
        f = open(MODEL_PATH + 'action_detection.model', 'rb')
        self.model = pickle.load(f)
        f.close()
        return

    def extract_feature(self, sent):
        feature = [0] * (self.n_bow+self.n_verbs)
        verbs = [ w for w,pos in self.tagger.tag(word_tokenize(sent)) if pos=='VB' ]
        words = set(sent.split())
        for i in xrange(self.n_bow):
            feature[i] = 1 if self.BOW[i] in words else 0
        for i in xrange(self.n_verbs):
            if not verbs:
                feature[self.n_bow+i] = 0
            else:
                similarities = [ wn.path_similarity(self.VERBS[i],wn.synset(v+'.v.01')) for v in verbs ]
                feature[self.n_bow+i] = max(similarities)
        return feature

    def predict(self, sent):
        # classes: 0(rectangle), 1(circle), 2(triangle), 3(wave), 4(rotate)
        feature = self.extract_feature(sent)
        idx = self.model.predict([feature])[0]
        probs = self.model._predict_proba_lr([feature])[0]
        # return value: 0(none), 1-5(classes+1)
        if probs[idx]>CONFIDENCE_THRESHOLD: return idx+1
        else: return 0
if __name__ == "__main__":
    try:
        debug = True if sys.argv[-1] == "debug" else False
        debug_tests = 3
        file_count = 0

        dir_corpus = sys.argv[1]
       
        extra_features = True
        qr = mdbcl.QueryResources()

        without_types = False

        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()

        train_sents = []

        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
                    
                    file_ann = os.path.join(dirname, f[:-4] + ".ann")
                    ann_file = open(file_ann, "r")
                    #file_ann_ext = os.path.join(dir_output, f[:-4] + ".anne")
                    #ann_ext_file = open(file_ann_ext, "w")
# input_filename="removed2_en.csv"
input_filename="sample_removed2_en.csv"
output_filename="sample_tag_nltk_en.csv"
output=open(output_filename,'w')

# WAY-1 200s/1000
# with open(input_filename) as data_file:
# 	for (index,line) in enumerate(data_file):
# 		line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding!
# 		sents = nltk.sent_tokenize(line)
# 		nltk.pos_tag_sents(sents) # WRONG!!!!
# 		print index

# WAY-2: Faster  20s/1000
tagger = PerceptronTagger() 
with open(input_filename) as data_file:
	for (index,line) in enumerate(data_file):
		line=line.decode('utf-8','ignore') # you have to decode the line using the corresponded coding!
		# sents = nltk.sent_tokenize(line)
		# print sents
		# sentences_pos=tagger.tag_sents(sents)
		word_list=nltk.word_tokenize(line)
		line_tagged=tagger.tag(word_list)
		if index in range(5000,60001,5000):
			print index
		# print line_tagged
		for t in line_tagged:
			output.write('_'.join(t)+' ')
		output.write('\n')
if __name__ == "__main__":
    try:
        debug = True if sys.argv[-1] == "debug" else False
        debug_tests = 3
        file_count = 0

        dir_corpus = sys.argv[1]       
        dir_output = sys.argv[2]
        try:
            training_crfsuite = sys.argv[3]
        except:
            training_crfsuite = 'keyphrase.crfsuite'

        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()

        extra_features = True
        qr = mdbcl.QueryResources()

        crftagger = pycrfsuite.Tagger()
        crftagger.open(training_crfsuite)

        #test_sents = []
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
Example #43
 def __init__(self):
     self.pos_tagger = PerceptronTagger()
if __name__ == "__main__":
    try:
        debug = True if sys.argv[-1] == "debug" else False
        debug_tests = 3
        file_count = 0

        dir_corpus = sys.argv[1]       
        dir_output = sys.argv[2]
        try:
            training_crfsuite = sys.argv[3]
        except:
            training_crfsuite = 'keyphrase.crfsuite'

        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()

        crftagger = pycrfsuite.Tagger()
        crftagger.open(training_crfsuite)

        extra_features = True
        qr = mdbcl.QueryResources()

        #test_sents = []
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_count += 1
                    if debug and file_count > debug_tests:
                        break
        try:
            dir_corpus = sys.argv[1]
            dir_output = sys.argv[2]
        except:
            print >> sys.stderr, "E) Directories: ", sys.exc_info()

        try:
            pos_sequences_filename = sys.argv[3]
            count_limit = int(sys.argv[4])
            pos_sequences, is_posregex = kpcommon.get_pos_tags_by_count(pos_sequences_filename, count_limit)
        except:
            print >> sys.stderr, "E) Common tags: ", sys.exc_info()

        qr = mdbcl.QueryResources()
        tokenizer = Tokenizer()
        tagger = PerceptronTagger()
        lemmatizer = WordNetLemmatizer()
        stemmer = LancasterStemmer()
        
        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.txt':
                    file_count += 1 #debug
                    if debug and file_count > debug_tests: #debug
                        break #debug
                    print file_count, f[:-4]
                    try:
                        file_text = os.path.join(dirname, f[:-4] + ".txt")
                        text_file = open(file_text, "r")
                        raw_text = unicode(text_file.read(), encoding="utf-8")
#!/usr/bin/python 
import sys
import os
from nltk.tokenize import TreebankWordTokenizer as Tokenizer
from nltk.tag.perceptron import PerceptronTagger
import operator

if __name__ == "__main__":
    try:
        dir_corpus = sys.argv[1]
        dir_output = sys.argv[2]
        
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()

        for (dirname, _, filenames) in os.walk(dir_corpus):
            for f in filenames:
                ext = f[-4:]
                if ext == '.ann':
                    file_ann = os.path.join(dirname, f[:-4] + ".ann")
                    ann_file = open(file_ann, "r")
                    file_ann_ext = os.path.join(dir_output, f[:-4] + ".anne")
                    ann_ext_file = open(file_ann_ext, "w")
                    print f[:-4]
                    indexes_kp_tmp = {}
                    for ann in ann_file:
                        ann = unicode(ann, encoding="utf-8")
                        if ann[0] not in ["R", "*"]:
                            ann_items = ann.strip().split("\t")
                            if ann_items[1].find(";") >= 0:
import nltk
from nltk.tag.perceptron import PerceptronTagger

# nltk.download()

sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

tagger = PerceptronTagger(False)
tagger.load('file:///C:/Users/yarov/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
tagged = tagger.tag(tokens)
print tagged
Example #48
(c) R. Loth ISCPIF-CNRS (UPS 3611)
"""

from re   import sub,search
from glob import glob
from nltk import sent_tokenize
from nltk import word_tokenize
# from nltk import pos_tag
from nltk.tag.perceptron    import PerceptronTagger
from nltk import RegexpParser

from json import load, dump

from datetime import datetime

tagr = PerceptronTagger()


doc_metas = {}
with open('work/meta/test.json') as metas:
    doc_metas = load(metas)

doc_paths = glob("work/corpus/01-originaux/*")

# total par terme
term_totals = {}

# nombre de docs ayant le terme par terme
term_ndocs = {}

# nombre de tokens dans le terme
Example #49
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib
import numpy as np
import nltk
import re
from nltk.tag.perceptron import PerceptronTagger
from sklearn import cross_validation
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
import os

dir_path = os.path.dirname(os.path.abspath(__file__))

file_path = os.path.join(dir_path, 'Data/LowerTestData.txt')
classifier_path = os.path.join(dir_path, 'Classifier/Classifier.pkl')

tagger = PerceptronTagger()
classifier = Pipeline([
    ('vectorizer', CountVectorizer(min_df=1, max_features=100, lowercase=False, decode_error='strict')),
    ('clf', OneVsRestClassifier(LinearSVC()))])
print "Classifier pipeline initialized."
labels = []
data = []
print "Processing the training file."
with open (file_path, 'r') as readFile :
    for row in readFile :
        arr = []
        sentences = []
        arr = row.split(",,,")
        arr[1] = arr[1].strip(' ')
        arr[1] = arr[1].strip('\n')
        labels.append(arr[1])
import numpy as np
import nltk
from nltk.tag.perceptron import PerceptronTagger
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

tagger = PerceptronTagger()
name_tag = []
non_name_tag = []
full_names = []
non_names = []

with open('full_names.txt') as infile:
    full_names = infile.read().splitlines()

with open('non_names.txt') as infile:
    non_names = infile.read().splitlines()

for x in full_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    name_tag.append(tags[0][0][1]+' '+tags[0][1]+' '+tags[1][0][1]+' '+tags[1][1])

for x in non_names:
    tags = nltk.ne_chunk(tagger.tag(x.split())).pos()
    non_name_tag.append(tags[0][0][1]+' '+tags[0][1]+' '+tags[1][0][1]+' '+tags[1][1])

name_tag = [w.split() for w in name_tag]
non_name_tag = [w.split() for w in non_name_tag]
import cowparser as cp

train_sents = []
test_sents = []

gen = cp.sentences_for_dir(separate=False)
for i, (metadata, data) in enumerate(gen):
    train_sents.append([(a,b) for a,b,c in data])
    if i == 2000000:
        break

for i, (metadata, data) in enumerate(gen):
    test_sents.append([(a,b) for a,b,c in data])
    if i == 5000:
        break

from nltk.tag.perceptron import PerceptronTagger
pt = PerceptronTagger(load=False)
pt.train(train_sents,'model2.perc.dutch_tagger')
print(pt.evaluate(test_sents))
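Once training finishes, the saved model can be reloaded without the default English weights, mirroring Example #15; a sketch assuming the save location used above:

import os
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger(load=False)                               # skip the English model
tagger.load('file:' + os.path.abspath('model2.perc.dutch_tagger'))  # pickle written by train()
print(tagger.tag(['Dit', 'is', 'een', 'zin', '.']))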
Example #52
		title= title.replace('\n','')
		titles+=[title]
		bodies+=[standard_body(body)]
	else:
		titles+=['Unknown']
		bodies+=[standard_body(story)]
		
### -------------------------- ###
#      Tokenize and tag text     #
### -------------------------- ###
stories = pd.DataFrame({'title':titles,'body':bodies})
stories['sents'] = stories['body'].map(lambda x: nltk.sent_tokenize(x))
stories['words'] = stories['sents'].map(lambda x: [[w.lower() for w in nltk.word_tokenize(s)] for s in x])

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger()
stories['tags'] = stories['words'].map(lambda x: [tagger.tag(s) for s in x])

### -------------------------- ###
#  Categorize tags into genres   #
### -------------------------- ###
# Note: Standards replace 'TG' with '{CLASS}' i.e. 'NN' -> '{ANML}'
""" Genres:
 - occupation
 - animals: is_animal
 - body parts: 
 - exclamation
 - food
 - location
"""
is_animal = lambda x: is_hyper_of(x,'animal')