Example #1
def segment_en(texts, flag_keep_number=False):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        if flag_keep_number:
            words = tk.tokenize(text)
        else:
            words = map(replace_number, tk.tokenize(text))
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(tqdm(qa)):
        row['question_toked'] = MyTokenizer.tokenize(
            row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa,
                      open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
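A minimal usage sketch for segment_en above, assuming the Stanford tokenizer jar is discoverable (for example via the CLASSPATH environment variable); with flag_keep_number=True the external replace_number helper is not needed:

# Hypothetical call; the jar location is an assumption, not part of the example above.
sentences = ["Good muffins cost $3.88 in New York.", "Please buy me two of them."]
segmented = segment_en(sentences, flag_keep_number=True)
for original, tokens in segmented.items():
    print(original, "->", tokens)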
Example #3
def data():
    with open("wonderland.txt", "r", encoding="utf-8-sig") as file:
        return [
            word.lower() for word in StanfordTokenizer(
                path_to_jar=
                r"C:\stanford-postagger-2016-10-31\stanford-postagger.jar",
                options={
                    "normalizeParentheses": "false",
                    "normalizeOtherBrackets": "false"
                }).tokenize(file.read())
        ]
    def __init__(self, task_queue, result_queue):

        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

        self.tokenizer = StanfordTokenizer(options={"ptb3Escaping": True})
        print('%s: Loading pickles...' % self.name)
        self.map_word_index = pickle.load(
            open('preproc/map_word_index.pkl', 'rb'))
        print('%s: Done.' % self.name)
Example #5
def Tokenize_stopwords_stemmer(texts):
    #print time()
    # Use this block for Stanford tokenization; skip it when using an ordinary tokenizer.
    # tokenize
    Str_texts = texts[0]
    print(os.getcwd())
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a single string
    #print time()
    p2 = r'.+[-_\./"].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_tokenized:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    #print time()
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    p3 = r'.+">'
    pa3 = re.compile(p3)
    english_stopwords = stopwords.words('english')  # stopword list
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list
    texts_filtered0 = []
    for document in texts_filtered:
        if (document in pa1.findall(document) or document in pa3.findall(document)
                or document == '' or document == "''" or document == "``"
                or document in english_stopwords or document in english_punctuations):
            pass
        else:
            texts_filtered0.append(document)
    #print time()

    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered0]  # list of stemmed tokens
    #print time()

    return texts_Stemmered  # returns a list
Example #6
def readwordarr(isTokenize=True):
    posWords = []
    negWords = []
    stopwords = getstopword()
    if isTokenize:
        tokenizer = StanfordTokenizer()
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    else:
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    return posWords, negWords
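A sketch of the module-level names readwordarr relies on; the file paths and stopword source below are placeholder assumptions, not the original project's values:

# Assumed globals used by readwordarr; adjust to your own corpus layout.
negfilepath = 'data/neg.txt'   # hypothetical path
posfilepath = 'data/pos.txt'   # hypothetical path

def getstopword():
    # Placeholder: the original project loads its own stopword list.
    from nltk.corpus import stopwords
    return set(stopwords.words('english'))

# With isTokenize=False no Stanford jar is required.
posWords, negWords = readwordarr(isTokenize=False)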
Example #7
def _get_sentence_embeddings(sentences,
                             ngram='bigrams',
                             model='concat_wiki_twitter'):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    wiki_embeddings = None
    twitter_embeddings = None
    tokenized_sentences_NLTK_tweets = None

    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        s = ' <delimiter> '.join(
            sentences)  #just a trick to make things faster
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(
            ' <delimiter> ')
        assert (len(tokenized_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                     MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        if ngram == 'unigrams':
            twitter_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        else:
            twitter_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_NLTK_tweets, \
                                     MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    if model == "twitter":
        return twitter_embeddings
    elif model == "wiki":
        return wiki_embeddings
    elif model == "concat_wiki_twitter":
        return np.concatenate((wiki_embeddings, twitter_embeddings), axis=1)
    sys.exit(-1)
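A hedged usage sketch for _get_sentence_embeddings; it assumes the module constants it reads (SNLP_TAGGER_JAR, FASTTEXT_EXEC_PATH and the MODEL_* paths) already point at valid files:

sentences = ['Once upon a time.', 'This is another sentence.']
# One row per input sentence; wiki-only keeps the example small.
embeddings = _get_sentence_embeddings(sentences, ngram='unigrams', model='wiki')
print(embeddings.shape)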
def tokenize_stopwords_stemmer(texts):
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(
        path_to_jar=r"C:\Users\zw\Desktop\stanford-parser.jar")  # path_to_jar locates the jar; the raw-string prefix r keeps sequences like '\t' in the path from being escaped
    java_path = 'E:soft/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a single string; tokenize it
    # print(texts_tokenized)

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)  # re.compile() compiles the regex string into a Pattern object that is then used to match each token
    texts_filtered0 = [document for document in texts_tokenized
                       if document not in pa1.findall(document)]

    p2 = r'.+[-_\/].+'  # changed from r'.+[-_\./].+' so that periods inside numbers are kept, e.g. version strings like 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:  # split() cuts the token on the given separator and returns a list of pieces
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)

    texts_filtered = [document for document in texts_filtered if
                      document != '' and document != "''" and document != "``"]  # drop empty strings and stray quote tokens

    # # stopwords
    # english_stopwords = stopwords.words('english')  # stopword list
    # texts_filtered_stopwords = [document for document in texts_filtered if not document in english_stopwords]  # drop stopwords

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\n', '||',
                            '<', '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '0', '$', '^', '/*', '*/', '/**', '**/',
                            '**', '-', '_', '__', '|', '+', '=', r'-?-', r'@?']  # punctuation list

    texts_filtered = [document for document in texts_filtered if
                      document not in english_punctuations]  # drop punctuation tokens
    return texts_filtered
Example #9
    def __init__(self, user_dict=None):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"]
        )
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
Example #10
def get_sentence_embeddings(sentences, ngram='uni'):
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)  #just a trick to make things faster
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
    tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(
        ' <delimiter> ')
    if len(tokenized_sentences_SNLP) != len(sentences):
        print('SENT2VEC TOKENIZATION FAILED')
        tokenized_sentences_SNLP = sentences
    #assert(len(tokenized_sentences_SNLP) == len(sentences))
    if ngram == 'uni':
        embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                 MODEL_TORONTOBOOKS_UNIGRAMS, FASTTEXT_EXEC_PATH)
    elif ngram == 'bi':
        embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                 MODEL_TORONTOBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)
    else:
        raise NotImplementedError
    return embeddings
Example #11
def par_tokenize(text_list,
                 clean_html=False,
                 tokenizer="twitter",
                 remove_reps=True,
                 spell_correct=True):
    if tokenizer == "stanford":
        tolkenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tolkenizer_obj = TweetTokenizer()
    else:
        tolkenizer_obj = StringTokenizer()

    import multiprocessing as mp
    from functools import partial
    pool = mp.Pool(NUM_PROC)
    tolkenize_func = partial(__tolkenize_text_blob,
                             clean_html=clean_html,
                             remove_reps=remove_reps,
                             spell_correct=spell_correct,
                             tolkenizer_obj=tolkenizer_obj)
    token_list = pool.map(tolkenize_func, text_list)
    return token_list
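A usage sketch for par_tokenize, assuming NUM_PROC and the __tolkenize_text_blob helper referenced above are defined elsewhere in the module:

texts = ["Sooo goooood!", "I loooove this place"]
# The Twitter tokenizer avoids the Stanford jar dependency; spell correction off for speed.
token_lists = par_tokenize(texts, tokenizer="twitter", spell_correct=False)
print(token_lists)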
def stanford_tokenizer(text):

    tokenizer = StanfordTokenizer(
        path_to_jar=
        'D:/software/stanford-parser-full-3.7/stanford-parser-3.7.0-models.jar'
    )

    # sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    return tokenizer.tokenize(text)


# if __name__=='__main__':
#     sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
#     result = stanford_tokenizer(sent)
#     print(result)

# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

# from nltk.tokenize import StanfordTokenizer
# s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
# StanfordTokenizer().tokenize(s)
# s = "The colour of the wall is blue."
# StanfordTokenizer(options={"americanize": True}).tokenize(s)
def tokenize_and_save_corpus(corpus_filename, new_filename):
    with open(corpus_filename, 'r') as f:
        corpus_str = f.read()
    tokenized = StanfordTokenizer().tokenize(corpus_str)
    lowered = [w.lower() for w in tokenized]

    num = r'(?<!\S)(\d*\.?\d+|\d{1,3}(,\d{3})*(\.\d+)?)(?!\S)'
    number_words = {}
    new_words = []
    for word in lowered:
        if word in number_words:
            new_words.extend(number_words[word])
        else:
            numbers = re.findall(num, word)
            if numbers:
                number = numbers[0][0]
                nwords = word_numbers(number)
                number_words[word] = nwords
                new_words.extend(nwords)
            else:
                new_words.append(word)
    with open(new_filename, 'w', encoding='utf-8') as f:
        f.write(' '.join(new_words))
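A usage sketch for tokenize_and_save_corpus, assuming the Stanford jar is on CLASSPATH and that word_numbers (which spells a numeric string out as words) is defined elsewhere:

# The number regex matches stand-alone integers and decimals, e.g.
# re.findall(num, '3.88') -> [('3.88', '', '')], so '3.88' gets replaced by its word form.
tokenize_and_save_corpus('wonderland.txt', 'wonderland_tokenized.txt')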
Example #14
def tokenize(text_list,
             clean_html=False,
             tokenizer="twitter",
             remove_reps=True,
             spell_correct=True):
    if tokenizer == "stanford":
        tolkenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tolkenizer_obj = TweetTokenizer()
    else:
        tolkenizer_obj = StringTokenizer()

    token_list = []
    for text in text_list:
        if clean_html:
            text = BeautifulSoup(text).get_text()
        if remove_reps:
            text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        tokens = tokenizer_obj.tokenize(text)
        if spell_correct:
            tokens = [spell(t) for t in tokens]
        token_list.append(tokens)
    return token_list
Example #15
    def __init__(self, **kwargs):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_segmenter = self.conf_corenlp["segmenter"]
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]

        self.segmenter = StanfordSegmenter(
            path_to_jar=prefix + conf_segmenter["path_to_jar"],
            path_to_sihan_corpora_dict=prefix +
            conf_segmenter["path_to_sihan_corpora_dict"],
            path_to_model=prefix + conf_segmenter["path_to_model"],
            path_to_dict=prefix + conf_segmenter["path_to_dict"],
            path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
            encoding=conf_segmenter["encoding"])
        self.enTokenizer = StanfordTokenizer(path_to_jar=prefix +
                                             conf_tokenizer["path_to_jar"])
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.frequency = defaultdict(int)
        pynlpir.open()
        pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"],
                                     Overwrite=False)

        try:
            self.excluded_docs = kwargs["excluded_docs"]
        except KeyError:
            self.excluded_docs = [""]

        # experimental features
        self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]
def tokenize(text_rdd,
             clean_html=False,
             tokenizer="twitter",
             remove_reps=True,
             spell_correct=True):
    if tokenizer == "stanford":
        tokenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tokenizer_obj = TweetTokenizer()
    else:
        tokenizer_obj = StringTokenizer()
    print("Processing {} tokns".format(text_rdd.count()))

    if remove_reps:
        text_rdd = text_rdd.map(
            lambda text: re.sub(r'(.)\1{2,}', r'\1\1', text))
    if clean_html:
        text_rdd = text_rdd.map(lambda text: BeautifulSoup(text).get_text())
    tokens_rdd = text_rdd.map(lambda text: tokenizer_obj.tokenize(text))
    if spell_correct:
        tokens_rdd = tokens_rdd.map(lambda tokens: [spell(t) for t in tokens])
        #tokens_rdd = tokens_rdd.map(lambda tokens: [t for t in tokens])

    return tokens_rdd
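A hedged Spark usage sketch for the RDD variant above, assuming an active SparkContext named sc and that bs4 and the autocorrect spell function are importable:

raw_rdd = sc.parallelize(["Sooo goooood!", "<p>hello world</p>"])
tokens_rdd = tokenize(raw_rdd, clean_html=True, tokenizer="twitter", spell_correct=False)
print(tokens_rdd.take(2))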
#lines = f.read().split('</text>')
#for index, line in enumerate(lines):
# remove leading and trailing whitespace
lines = file.split('</text>')
for line in lines:
    newline = ''
    try:
        if "<text xml:space=\"preserve\">" in line and "#REDIRECT" not in line:
            newline = line[line.find("<text xml:space=\"preserve\">") +
                           len("<text xml:space=\"preserve\">"):]
            if guess_language(newline) == 'en':
                s = re.sub(
                    r'[^A-Za-z0-9\s.,\'\";?$%+-:!]+', '@',
                    re.sub(r'\d', '0', newline).replace('[', ' ').replace(
                        ']', ' ').replace('}', ' ').replace('{', ' '))
                s2 = StanfordTokenizer().tokenize(s)
                # force ASCII; characters outside ASCII are dropped
                s3 = [word.encode('ascii', 'ignore').decode('ascii') for word in s2]
                charCounter = 0
                tokenCounter = 0
                sentStart = 0
                deleteThese = []
                # mark sentences shorter than 20 characters or 5 tokens
                for index, token in enumerate(s3):
                    if token == '.':
                        if charCounter < 20 or tokenCounter < 5:
                            deleteThese.append([sentStart, index])
                        charCounter = 0
                        tokenCounter = 0
                        sentStart = index + 1
                    else:
                        charCounter += len(token)
                        tokenCounter += 1
Example #18
# -*- coding: utf-8 -*-
from nltk.tokenize import StanfordTokenizer
import time


def is_ascii(s):
    return all(ord(c) < 128 for c in s)


last_time = time.time()
line_buffer = ''
with open('WestburyLab.Wikipedia.Corpus.txt') as infp, open(
        'TokenizedCorpus.txt', 'w') as outfp:
    for e, line in enumerate(infp):
        if (e + 1) % 10000 == 0:
            line_buffer = StanfordTokenizer().tokenize(line_buffer)
            try:
                outfp.write(' '.join(line_buffer) + '\n')
            except:
                for i in range(len(line_buffer)):
                    if not is_ascii(line_buffer[i]):
                        line_buffer[i] = '<UNK>'
                outfp.write(' '.join(line_buffer) + '\n')
            line_buffer = ''
            print(e + 1, '/ 30749930',
                  float(e + 1) / 30749930, time.time() - last_time)

        if line.strip() == '':
            continue
        line_buffer += (line + ' <br> ')
Example #19
java_path = "C:/Program Files/Java/jre1.8.0_131/bin/java.exe"
parser_path = "D:/stanford-parser-full-2016-10-31/stanford-parser.jar"
models_path = "D:/stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar"
engPCFG_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

import os
os.environ['JAVA_HOME'] = java_path

from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(parser_path)

from nltk.parse.stanford import StanfordDependencyParser
parser = StanfordDependencyParser(parser_path, models_path, engPCFG_path)

from nltk.corpus import wordnet

import nltk
from nltk.tree import Tree
from nltk.corpus.reader.wordnet import Lemma
from nltk.corpus import semcor

noun = set(['NN', 'NNS', 'NNP', 'NNPS'])
verb = set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjective = set(['JJ', 'JJR', 'JJS'])
adverb = set(['RB', 'RBR', 'RBS'])
substantive = noun | verb | adjective | adverb

corp = semcor.sents()

tags = semcor.tagged_sents(tag = 'sem')
Example #20
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from wordsegment import load, segment

CUR_DIRECTORY = '/home/wmq/Desktop/DeepText/StanfordNLP'
SEGMENT_PATH = CUR_DIRECTORY + '/stanford-segmenter-3.8.0.jar'
NER_MODEL_PATH = CUR_DIRECTORY + '/english.all.3class.distsim.crf.ser.gz'
NER_JAR_PATH = CUR_DIRECTORY + '/stanford-ner.jar'
POS_MODEL_PATH = CUR_DIRECTORY + '/english-left3words-distsim.tagger'
POS_JAR_PATH = CUR_DIRECTORY + '/stanford-postagger.jar'

ner_tagger = StanfordNERTagger(NER_MODEL_PATH, NER_JAR_PATH, java_options='')
pos_tagger = StanfordPOSTagger(POS_MODEL_PATH, POS_JAR_PATH, java_options='')
tokenizer = StanfordTokenizer(SEGMENT_PATH)
load()

s = "@user nah pretty sure it's jackson's great jokes"
ws = tokenizer.tokenize(s)
print(' '.join(ws))
# print (' '.join(segment('#happythankgiving')))
# s = 'i got to to go formal with my best friend @ phi mu at jsu'.split()
# ner_sent = ner_tagger.tag(s)
# pos_sent = pos_tagger.tag(s)
# print (ner_sent)
# print (pos_sent)
Example #21
def en_standseg(sent):
    tokenizer = StanfordTokenizer(
        path_to_jar=r"E:\tools\stanfordNLTK\jar\stanford-parser.jar")
    print(tokenizer.tokenize(sent))
Example #22
def stanford_tokenize(s):
    return StanfordTokenizer().tokenize(s)
Example #23
import os

from nltk.tag import StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer

from lexnlp import is_stanford_enabled
from lexnlp.nlp.en.tokens import STOPWORDS, get_lemma_list
from lexnlp.config.stanford import STANFORD_POS_PATH

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2019, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.2.7"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

# Setup Stanford POS configuration
try:
    STANFORD_POS_FILE = os.path.join(STANFORD_POS_PATH,
                                     "stanford-postagger.jar")
    STANFORD_TOKENIZER = StanfordTokenizer(path_to_jar=STANFORD_POS_FILE)
    STANFORD_DEFAULT_TAG_MODEL = os.path.join(
        STANFORD_POS_PATH, "models", "english-bidirectional-distsim.tagger")
    STANFORD_TAGGER = StanfordPOSTagger(STANFORD_DEFAULT_TAG_MODEL,
                                        STANFORD_POS_FILE)
except LookupError:
    STANFORD_TOKENIZER = STANFORD_TAGGER = None


def check_stanford():
    if not is_stanford_enabled():
        raise RuntimeError(
            "USE_STANFORD is set to False. No Stanford functionality available."
        )
    if not STANFORD_TOKENIZER:
        raise RuntimeError("USE_STANFORD is set to True."
Example #24
# Chinese word segmentation
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
    path_to_jar="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/stanford-segmenter-3.5.2.jar",
    path_to_slf4j="/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/slf4j-api.jar",
    path_to_sihan_corpora_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data",
    path_to_model="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/pku.gz",
    path_to_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/dict-chris6.ser.gz"
)
sentence = "我在我在博客园开了一个博客。"
print(segmenter.segment(sentence))

# English tokenization


from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(path_to_jar=r"/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/stanford-parser.jar")
sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
print(tokenizer.tokenize(sent))

# Chinese named entity recognition
from nltk.tag import StanfordNERTagger
chi_tagger = StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/chinese.misc.distsim.crf.ser.gz',
                               path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar')
print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。\r\n'.split()))




# English named entity recognition
from nltk.tag import StanfordNERTagger
eng_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz'
Example #25
# -*- coding:utf-8 -*-
from nltk.tokenize import StanfordTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
import pickle as pkl

# English tokenization

tokenizer = StanfordTokenizer(
    path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar")
#
# sentence2 = "Micron hasn't declared its first quarterly profit for three years, but he has been dead."
# sentence = "Jim Williams director of the US VISIT project said that by the middle of November many arriving passengers in Atlanta will be fingerprinted and photographed"
#
# print(tokenizer.tokenize(sentence2))

# English POS tagging
# POS tag descriptions: http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html
# eng_tagger = StanfordPOSTagger(
#     path_to_jar=r"D:\stanford-postagger-full-2016-10-31\stanford-postagger.jar",
#     model_filename=r"D:\stanford-postagger-full-2016-10-31\models\english-bidirectional-distsim.tagger"
# )
# print("词性标注结果", eng_tagger.tag(sentence.split()))

# Syntactic parsing

# eng_parser = StanfordParser(
#     path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar",
#     path_to_models_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar",
Example #26
    def __init__(self, classifier, jar_file, field_to_process, output_field):
        self.classifier = classifier
        self.jar_file = jar_file
        self.field_to_process = field_to_process
        self.output_field = output_field
        self.tokenizer = StanfordTokenizer(path_to_jar=self.jar_file).tokenize
    elif token == '-RCB-':
        token = '}'
    return token


def tokenize_sentences(tknzr, sentences, to_lower=True):
    """Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentences: a list of sentences
        - to_lower: lowercasing or not
    """
    return [tokenize(tknzr, s, to_lower) for s in sentences]


fileName = sys.argv[1]

SNLP_TAGGER_JAR = "/home/pgupta/stanford-postagger.jar"

sentences = []
with open(fileName, 'r') as fileinput:
    for line in fileinput:
        sentences.append(line)

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
s = ' <delimiter> '.join(sentences)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')

for sentence in tokenized_sentences_SNLP:
    print(sentence)
Example #28
import re, nltk, sys
from nltk.tokenize import StanfordTokenizer

tokenizer = StanfordTokenizer(
    r'../common/stanford-postagger-2015-04-20/stanford-postagger.jar')
for line in open("TEST_FILE.txt"):
    m = re.match(r'^([0-9]+)\s"(.+)"$', line.strip())
    if m is not None:
        txtfile = open("test/%s.txt" % m.group(1), 'w')
        annfile = open("test/%s.ann" % m.group(1), 'w')
        line = m.group(2)
        text = []
        t = line.split("<e1>")
        text.append(t[0])
        e1start = len(t[0])
        t = t[1].split("</e1>")
        e1 = t[0]
        text.append(t[0])
        e1end = len(t[0]) + e1start
        t = t[1].split("<e2>")
        text.append(t[0])
        e2start = len(t[0]) + e1end
        t = t[1].split("</e2>")
        text.append(t[0])
        e2 = t[0]
        e2end = len(t[0]) + e2start
        text.append(t[1])
        text = " ".join(tokenizer.tokenize("".join(text)))
        txtfile.write(text)
        txtfile.write("\n")
        offset = 0
Example #29
from os.path import abspath, dirname

from nltk.tag import StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from nltk.parse.stanford import StanfordParser

abs_path = dirname(abspath(__file__))

# STANFORD TOOLS
#-------------------------------------------------------------------------------
# paths
stanford_pos_dir = abs_path + '/stanford-postagger-2015-12-09/'
postag_modelfile = stanford_pos_dir + 'models/english-bidirectional-distsim.tagger'
postag_jar = stanford_pos_dir + 'stanford-postagger.jar'
stanford_parser_dir = abs_path + '/stanford-parser-full-2015-12-09/'
parser_eng_model = stanford_parser_dir + 'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
parser_models_jar = stanford_parser_dir + "stanford-parser-3.6.0-models.jar"
parser_jar = stanford_parser_dir + "stanford-parser.jar"

# Instances:
st_tagger = StanfordPOSTagger(model_filename=postag_modelfile,
                              path_to_jar=postag_jar)

st_tknzr = StanfordTokenizer(path_to_jar=postag_jar)

st_parser = StanfordParser(model_path=parser_eng_model,
                           path_to_models_jar=parser_models_jar,
                           path_to_jar=parser_jar)

# BIKEL
#-------------------------------------------------------------------------------
# paths
bikel_dir = abs_path + '/dbparser/'
bk_parser_path = bikel_dir + 'bin/parse'
bk_settings = bikel_dir + 'settings/collins.properties'
bk_parser_model = bikel_dir + 'bikel/wsj-02-21.obj.gz'
# cmd = bikel_parser+" 400"+bk_settings+" "+bk_parser_model + inputfile
Example #30
info['share_indicator'] = np.nan
info['num'] = np.nan
info['person'] = np.nan
info['num_person'] = np.nan

# In[ ]:

share_num = {}
#token_dict = {}
pos_dict = {}
ent_dict = {}
share_num_all = {}

# In[ ]:

tokenizer = StanfordTokenizer()

pos = StanfordPOSTagger('english-bidirectional-distsim.tagger')

parser = StanfordParser(
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

dp = StanfordDependencyParser(
    model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')

ner = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz')

# # 2. Define functions

# ## 2.1 Joblib parallel computing function for loops