def __init__(self, f, keywords):
        """Build all per-document features for the text in *f*.

        Tokenizes, stems, POS-tags, NER-tags and parses the document both
        as a whole and sentence-by-sentence, then initialises the keyword
        counters and runs extract_keywords().

        f: open file-like object containing the document text.
        keywords: keyword collection stored for later feature extraction.
        """
        text = f.read()
        self.keywords = keywords
        self.file = text
        self.sentences = sent_tokenize(text)
        self.parser = StanfordParser(
            "stanford-parser-full-2014-08-27/stanford-parser",
            "stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models")
        self.tagger = st.StanfordPOSTagger(
            "stanford-postagger-full-2014-08-27/models/french.tagger",
            "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
        self.ner = st.StanfordNERTagger(
            "stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz",
            "stanford-ner-2014-08-27/stanford-ner.jar")

        # Parse every sentence; an OSError from the Java subprocess leaves
        # an empty placeholder so indices stay aligned with self.sentences.
        self.trees = []
        for sent in self.sentences:
            try:
                self.trees.append(self.parser.raw_parse(sent))
            except OSError:
                self.trees.append([])
        self.words = self.word_tokenize_without_punc(text)
        self.stemmer = FrenchStemmer()
        self.stems = [self.stemmer.stem(w) for w in self.words]
        self.words_sentences = [
            self.word_tokenize_without_punc(sent) for sent in self.sentences
        ]
        self.tags = self.tagger.tag(self.words)
        self.tags_sentences = [
            self.tagger.tag(words) for words in self.words_sentences
        ]
        self.entities = self.ner.tag(self.words)
        self.entities_sentences = [
            self.ner.tag(words) for words in self.words_sentences
        ]
        # defaultdict(int) is the idiomatic zero-defaulting counter.
        self.left_subject = defaultdict(int)
        self.left_compl = defaultdict(int)
        self.left_neg_subject = defaultdict(int)
        self.left_neg_compl = defaultdict(int)
        self.right_subject = defaultdict(int)
        self.right_compl = defaultdict(int)
        self.right_neg_subject = defaultdict(int)
        self.right_neg_compl = defaultdict(int)
        self.left_ref = 0
        self.right_ref = 0
        # BUG FIX: when a sentence failed to parse, the placeholder is an
        # empty list and list(e)[0] raised IndexError; fall back to an
        # empty leaf set so indices stay aligned with self.sentences.
        self.trees_leaves = []
        for tree_iter in self.trees:
            parses = list(tree_iter)
            leaves = []
            if parses:
                extract_leaves(parses[0], leaves)
            self.trees_leaves.append(tuple_to_dict(leaves))
        self.extract_keywords()
Example #2
0
def extract_words_plus_pos_tags(texts, lang):
    """Tokenize *texts* and POS-tag the tokens with a Stanford tagger.

    Returns a list of (word, tag) pairs, or an empty list when no Stanford
    model is configured for *lang*.  English PTB tags are converted to the
    universal tag set.
    """
    # Guard clause: nothing to do without a configured model for this language.
    if lang not in stanford_lang_models:
        return []

    import nltk.tag.stanford as stanford_tagger
    tagger = stanford_tagger.StanfordPOSTagger(
        stanford_res_path + stanford_lang_models[lang],
        path_to_jar=stanford_res_path + "stanford-postagger.jar")
    tokens = word_tokenize(texts, language=lang_map[lang])
    tagged = tagger.tag(tokens)
    if lang == 'en':  # convert eng tags to universal tags
        tagged = [(word, map_tag('en-ptb', 'universal', tag))
                  for word, tag in tagged]
    return tagged
Example #3
0
    def train(self):
        """Initialise the Stanford POS tagger used by this instance.

        Loads the english-left3words-distsim model relative to *home* and
        stores the tagger on self.st.
        """
        base = home + '../../stanford-postagger/'
        model_path = base + 'models/english-left3words-distsim.tagger'
        jar_path = base + 'stanford-postagger.jar'
        self.st = stanford.StanfordPOSTagger(model_path, jar_path)
Example #4
0
from nltk.corpus import stopwords

# Tell NLTK where to find the Stanford models, the parser classpath,
# and the JVM used to run the Java tools.
os.environ[
    'STANFORD_MODELS'] = 'stanford-segmenter-2018-10-16/data/;stanford-postagger-full-2018-10-16/models/'
os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2018-10-17'
os.environ['CLASSPATH'] = 'stanford-parser-full-2018-10-17'
os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk-11.0.1'

# Segment the Arabic text in sample.txt into whitespace-separated tokens.
segmenter = StanfordSegmenter(
    'stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar')
segmenter.default_config('ar')
text = segmenter.segment_file('sample.txt')
print(text)

# POS-tag each segmented token and print only the tag (tag[1]).
tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'stanford-postagger-full-2018-10-16/stanford-postagger.jar')
for tag in tagger.tag(text.split()):
    print(tag[1])

# Constituency-parse each '.'-separated chunk and draw every tree in a
# Tk window (blocks until the window is closed).
parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)
        sentence.draw()

# Sentence-split the segmented text.  NOTE(review): Text is presumably
# polyglot.text.Text — the import is not visible in this chunk; confirm.
ner = Text(text)
for sent in ner.sentences:
    print(sent)
Example #5
0
import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
from nltk.tag import stanford
import threading
import nltk

# Path of the full item list.  NOTE(review): lst_fn is assigned but never
# used in this chunk — confirm it is needed.
lst_fn=data_cfg.all_list_fn

# Candidate identifiers, one per line of the configured list file.
candidate_list=gen_utils.read_lines_from_text_file(data_cfg.cad_lst.fn)

# Stanford POS tagger.  Only the model path is supplied, so the tagger jar
# must be discoverable via the CLASSPATH/STANFORD_MODELS environment
# variables — TODO confirm they are set before running.
pos_tagger = stanford.StanfordPOSTagger(os.path.join(data_cfg.root_path,"english-bidirectional-distsim.tagger"))

def tag_pos(qid):
    """POS-tag the pre-tokenized question *qid* and pickle the tag list.

    Reads <root_path>/<qid>_tokens.pkl, tags the tokens with the module's
    Stanford POS tagger, and writes the tag sequence to
    <feat_root_path>/<qid>.pkl.
    """
    # BUG FIX: the original used `qid & "_tokens.pkl"`, a bitwise AND that
    # raises TypeError; string concatenation was clearly intended.  str()
    # matches the str(qid) usage for the output filename below.
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))
    qpos = pos_tagger.tag(qtokens)

    # Keep only the tag from each (token, tag) pair.
    pos_list = [pos for _tok, pos in qpos]

    # NOTE(review): feat_root_path is not defined in this chunk —
    # presumably imported from the data config; confirm before running.
    pos_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(pos_list, pos_fn)

if __name__ == '__main__':
    # NOTE(review): the body of this guard was lost during extraction; the
    # environment assignment that followed is restored at top level below
    # so the module stays syntactically valid.
    pass

# Point NLTK at the Stanford models, parser classpath and the JVM.
os.environ[
    'STANFORD_MODELS'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\data;C:\\Users\\lenovo\\Documents\\salm\\stanford-postagger-full-2018-10-16\\models'
os.environ[
    'STANFORD_PARSER'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-parser-full-2018-10-17'
os.environ[
    'CLASSPATH'] = 'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-parser-full-2018-10-17'
# BUG FIX: raw string avoids relying on \P, \J, \j falling through as
# literal backslashes (a DeprecationWarning in modern Python).
# NOTE(review): this path names a JDK *installer* .exe, not an installed
# JDK home directory — JAVAHOME almost certainly needs fixing.
os.environ['JAVAHOME'] = r'C:\Program Files\Java\jdk-14.0.2_windows-x64_bin.exe'

# Segment the Arabic input file into whitespace-separated tokens.
segmenter = StanfordSegmenter(
    'C:\\Users\\snmuj\\OneDrive\\Documents\\salm\\stanford-segmenter-2018-10-16\\stanford-segmenter-3.9.2.jar'
)
segmenter.default_config('ar')
text = segmenter.segment_file('text file')
print(text)

# POS-tag each segmented token and print only the tag (tag[1]).
tagger = STag.StanfordPOSTagger(
    'arabic.tagger',
    'C:\\Users\\snmuj\\OneDrive\\Documents\\stanford-postagger-full-2018-10-16\\stanford-postagger.jar'
)
for tag in tagger.tag(text.split()):
    print(tag[1])

# Constituency-parse each '.'-separated chunk.
parser = SParse.StanfordParser(
    model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz')
sentences = parser.raw_parse_sents(text.split('.'))
for line in sentences:
    for sentence in line:
        print(sentence)
        # BUG FIX: draw() was dedented to module level, so only the very
        # last tree was drawn; the parallel example earlier in this file
        # draws every sentence inside the loop.
        sentence.draw()

from polyglot.text import Text
text = Text(text)
for sent in text.sentences:
    # NOTE(review): the loop body was lost during extraction (it was a
    # SyntaxError as scraped); printing each sentence matches the
    # parallel example earlier in this file.
    print(sent)
Example #7
0
def extract_component(f, keywords):
    """Extract features from open file *f* and write them under features/.

    f: open file object; f.name is assumed to carry a 6-character directory
       prefix (e.g. "train/") that is stripped for the output filename —
       TODO confirm for other input directories.
    keywords: keyword list passed through to FeaturesExtractor.
    """
    # Apply each function
    name = f.name
    fe = FeaturesExtractor(f, keywords)
    data = fe.extract_features()

    # BUG FIX: the output file was opened and never closed; the context
    # manager guarantees the features are flushed to disk.
    with open("features/" + name[6:], "w") as new_file:
        new_file.write('\n'.join(str(e) for e in data))


def extract_keywords(tagger):
    """Return the distinct keywords drawn from the left- and right-leaning
    word lists: words longer than 3 characters, capped at 50 per side."""
    def _top_words(scored):
        # Keep the word (first element of each pair) when long enough,
        # then truncate to the 50 best.
        return [pair[0] for pair in scored if len(pair[0]) > 3][:50]

    left_scored, right_scored = left_vs_right_keywords(tagger)
    return list(set(_top_words(left_scored)) | set(_top_words(right_scored)))


if __name__ == "__main__":
    tagger = st.StanfordPOSTagger(
        "stanford-postagger-full-2014-08-27/models/french.tagger",
        "stanford-postagger-full-2014-08-27/stanford-postagger.jar")
    files = glob.glob("train/*")
    keywords = extract_keywords(tagger)
    print(keywords)
    for i in range(len(files)):
        print("{} / {}...".format(i + 1, len(files)))
        extract_component(open(files[i], "r"), keywords)
Example #8
0
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
import numpy
from sklearn import svm
import sys
from nltk.tag import stanford
#from nltk.tag.corenlp import CoreNLPPOSTagger

# this is all done a bit haphazardly, seeing if using postagbigrams improves performance. If it does, do this properly, probably by taking the parse trees that I already generated for connective identification. I'm hoping/assuming here that the parser will assign the same/very similar postags as the pos tagger does.

# Add the jar and model via their path (instead of setting environment variables):
# Paths to the Stanford POS tagger jar and the German fast model.
jar = '/home/peter/phd/PhdPlayground/stanfordStuff/stanford-postagger-full-2018-02-27/stanford-postagger.jar'
model = '/home/peter/phd/PhdPlayground/stanfordStuff/stanford-postagger-full-2018-02-27/models/german-fast.tagger'

# Module-level German POS tagger used by the functions below.
pos_tagger = stanford.StanfordPOSTagger(model, jar, encoding='utf8')

#headers = ['id', 'connective', 'pos', 'sentencePosition', 'pathToRoot', 'class_label'
"""
pos2column = {
    0:'connective',
    1:'pos',
    2:'sentencePosition',
    3:'pathToRoot',
    4:'class_label'
}

"""
# Mapping from feature-column index to column name; starts empty and is
# presumably filled elsewhere — the disabled literal above shows the
# intended layout.  TODO confirm where it is populated.
pos2column = {}