import os

import pandas as pd


def splitAllWord(typeOfDataset="dev"):
    """Segment the text_a column of dataset/task1/<typeOfDataset>.tsv and write
    label<TAB>segmented-text lines to <typeOfDataset>_split.tsv."""
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    segmenter = StanfordSegmenter()
    segmenter.default_config('zh')

    maxCount = 2000000

    pathOfDev = "dataset/task1/%s.tsv" % typeOfDataset
    dfOfDev = pd.read_csv(pathOfDev, delimiter="\t")

    pathOfNewDev = "%s_split.tsv" % typeOfDataset

    count = 0
    with open(pathOfNewDev, "w", encoding='utf-8') as fw:
        for row in dfOfDev.iterrows():
            if count >= maxCount:
                break
            if count % 100 == 0:
                print("[%s]count = %s" % (typeOfDataset, count))

            label = row[1]['label']
            fw.write(str(label))
            fw.write("\t")
            sentence = row[1]['text_a']

            segmentOfSentence = segmenter.segment(sentence)
            for word in segmentOfSentence.split():
                fw.write(word)
                fw.write(" ")
            fw.write("\n")

            count += 1
def segment(labels, reviews):
    """Segment each review with the Stanford segmenter, writing 'label segmented-text'
    lines to reviews.txt and returning the segmented strings."""
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    segmented = []

    print('Creating BOW')
    # The segmenter install is expected under ../datasets/data-hauyi/
    os.environ["STANFORD_SEGMENTER"] = '../datasets/data-hauyi/stanford-segmenter-2018-10-16'
    seg = StanfordSegmenter(
        '../datasets/data-hauyi/stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar'
    )
    seg.default_config('zh')
    count = 0

    file_out = open('reviews.txt', 'a+')

    for i in range(len(reviews)):
        s = seg.segment(reviews[i])
        l = labels[i]
        line = str(l) + ' ' + s
        file_out.write(line)
        segmented.append(s)
        count = count + 1
        print('Count: ', count)

    file_out.close()
    return segmented
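# Usage sketch (illustrative only: the labels/reviews below are made-up stand-ins, and the
# segmenter install referenced above must exist locally):
if __name__ == '__main__':
    demo_labels = [1, 0]
    demo_reviews = [u'这是斯坦福中文分词器测试', u'我喜欢学习编程']
    print(segment(demo_labels, demo_reviews))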
Example #4
def segmenter(sentence):
    r"""Stanford Word Segmenter for Chinese.

    Split a Chinese sentence into a sequence of words.

    Args:
        sentence: a Chinese sentence.

    Returns:
        A list of UTF-8 decoded words.

    Example:
        sentence="广东外语外贸大学是一所具有鲜明国际化特色的广东省属重点大学,是华南地区国际化人才培养和外国语言文化、对外经济贸易、国际战略研究的重要基地。"
        [u'\u5e7f\u4e1c', u'\u5916\u8bed', u'\u5916\u8d38', u'\u5927\u5b66', u'\u662f', u'\u4e00', u'\u6240', u'\u5177\u6709', u'\u9c9c\u660e', u'\u56fd\u9645\u5316', u'\u7279\u8272', u'\u7684', u'\u5e7f\u4e1c', u'\u7701\u5c5e', u'\u91cd\u70b9', u'\u5927\u5b66', u'\uff0c', u'\u662f', u'\u534e\u5357', u'\u5730\u533a', u'\u56fd\u9645\u5316', u'\u4eba\u624d', u'\u57f9\u517b', u'\u548c', u'\u5916\u56fd', u'\u8bed\u8a00', u'\u6587\u5316', u'\u3001', u'\u5bf9\u5916', u'\u7ecf\u6d4e', u'\u8d38\u6613', u'\u3001', u'\u56fd\u9645', u'\u6218\u7565', u'\u7814\u7a76', u'\u7684', u'\u91cd\u8981', u'\u57fa\u5730', u'\u3002']
    """

    from nltk.tokenize.stanford_segmenter import StanfordSegmenter  # initialize the Stanford Chinese segmenter
    segmenter = StanfordSegmenter(
        path_to_jar=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/stanford-segmenter-3.4.1.jar',
        path_to_sihan_corpora_dict=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data',
        path_to_model=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data/pku.gz',
        path_to_dict=
        'D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data/dict-chris6.ser.gz'
    )  # load the Chinese word-segmentation model

    sent = segmenter.segment(sentence)  # run segmentation
    return sent.split()
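# Usage sketch (the D:/python/... paths above must point at a real segmenter install; the
# sentence is the one from the docstring):
if __name__ == '__main__':
    words = segmenter(u"广东外语外贸大学是一所具有鲜明国际化特色的广东省属重点大学")
    print(words)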
def segment_sentences(sentence_list):
    """Segment a list of Chinese sentences; returns one space-joined string per input sentence."""
    segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=os.path.join(stanford_corenlp_path,
                                 'stanford-segmenter-2018-02-27',
                                 'stanford-segmenter-3.9.1.jar'),
        path_to_slf4j=os.path.join(stanford_corenlp_path,
                                   'slf4j-api-1.7.25.jar'),
        path_to_sihan_corpora_dict=os.path.join(
            stanford_corenlp_path, 'stanford-segmenter-2018-02-27', 'data'),
        path_to_model=os.path.join(stanford_corenlp_path,
                                   'stanford-segmenter-2018-02-27', 'data',
                                   'pku.gz'),
        path_to_dict=os.path.join(stanford_corenlp_path,
                                  'stanford-segmenter-2018-02-27', 'data',
                                  'dict-chris6.ser.gz'),
        sihan_post_processing='true')
    result = segmenter.segment_sents(sentence_list)
    result = result.strip()
    segmented_list = re.split(os.linesep, result)
    if len(segmented_list[-1]) == 0:
        segmented_list = segmented_list[:-1]
    if len(segmented_list) != len(sentence_list):
        for i in range(len(segmented_list)):
            ss = ''.join(segmented_list[i].split())
            if ss != sentence_list[i]:
                print(i, '|', segmented_list[i], '|', sentence_list[i])
                # break
        print(len(segmented_list), len(sentence_list))
    assert len(segmented_list) == len(sentence_list)
    return segmented_list
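# Usage sketch (assumes stanford_corenlp_path is defined elsewhere in this module and points
# at a local Stanford segmenter install; the sentences are illustrative):
if __name__ == '__main__':
    for seg_line in segment_sentences([u'这是斯坦福中文分词器测试', u'我喜欢学习编程']):
        print(seg_line)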
Example #6
def ch_standseg(mystr):
    """Segment a Chinese string with the Stanford segmenter and print the result."""
    segmenter = StanfordSegmenter(
        path_to_jar=r"E:\tools\stanfordNLTK\jar\stanford-segmenter.jar",
        path_to_slf4j=r"E:\tools\stanfordNLTK\jar\slf4j-api.jar",
        path_to_sihan_corpora_dict=r"E:\tools\stanfordNLTK\jar\data",
        path_to_model=r"E:\tools\stanfordNLTK\jar\data\pku.gz",
        path_to_dict=r"E:\tools\stanfordNLTK\jar\data\dict-chris6.ser.gz")
    result = segmenter.segment(mystr)
    print(result)
Example #8
 def __init__(self):
     print('stanford segmenter init...')
     # stanford_corenlp_path = r'/media/mcislab/sdb1/home/mcislab/zwt/stanford_core_nlp'
     stanford_corenlp_path = r"D:\Desktop\stanford corenlp"
     self.segmenter = StanfordSegmenter(
         java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
         path_to_jar=os.path.join(stanford_corenlp_path, 'stanford-segmenter-2018-02-27', 'stanford-segmenter-3.9.1.jar'),
         path_to_slf4j=os.path.join(stanford_corenlp_path, 'slf4j-api-1.7.25.jar'),
         path_to_sihan_corpora_dict=os.path.join(stanford_corenlp_path, 'stanford-segmenter-2018-02-27', 'data'),
         path_to_model=os.path.join(stanford_corenlp_path, 'stanford-segmenter-2018-02-27', 'data', 'pku.gz'),
         path_to_dict=os.path.join(stanford_corenlp_path, 'stanford-segmenter-2018-02-27', 'data', 'dict-chris6.ser.gz'),
         sihan_post_processing='true'
     )
def segment():
    """
    Split Chinese sentences into words with the Stanford segmenter and print the result.
    :return: None
    """
    segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\stanford-segmenter-3.9.1.jar",
        path_to_slf4j=r"D:\Desktop\stanford corenlp\slf4j-api-1.7.25.jar",
        path_to_sihan_corpora_dict=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data",
        path_to_model=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data\pku.gz",
        path_to_dict=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data\dict-chris6.ser.gz",
        sihan_post_processing='true'
    )   # path to jar files should be changed

    # result = segmenter.segment(s)
    result = segmenter.segment_sents(["一个人在切西红柿", "这个把手该换了", "别把手放在我的肩膀上", "他正在量和服尺寸"])

    print(result)
Example #10
class StanfordTokenizer:
    """
    class for segmenting Chinese sentences
    uses stanford segmenter 3.9.1
    """
    def __init__(self):
        stanford_corenlp_path = r'/media/mcislab/sdb1/home/mcislab/zwt/stanford_core_nlp'
        self.segmenter = StanfordSegmenter(
            java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
            path_to_jar=os.path.join(stanford_corenlp_path,
                                     'stanford-segmenter-2018-02-27',
                                     'stanford-segmenter-3.9.1.jar'),
            path_to_slf4j=os.path.join(stanford_corenlp_path,
                                       'slf4j-api-1.7.25.jar'),
            path_to_sihan_corpora_dict=os.path.join(
                stanford_corenlp_path, 'stanford-segmenter-2018-02-27',
                'data'),
            path_to_model=os.path.join(stanford_corenlp_path,
                                       'stanford-segmenter-2018-02-27', 'data',
                                       'pku.gz'),
            path_to_dict=os.path.join(stanford_corenlp_path,
                                      'stanford-segmenter-2018-02-27', 'data',
                                      'dict-chris6.ser.gz'),
            sihan_post_processing='true')

    def segment_sents(self, sentences):
        result = self.segmenter.segment_sents(sentences)
        result = result.strip()
        segmented_list = re.split(os.linesep, result)
        if len(segmented_list[-1]) == 0:
            segmented_list = segmented_list[:-1]
        print(len(segmented_list), len(sentences))
        assert len(segmented_list) == len(sentences)
        return segmented_list

    def tokenize(self, captions_for_images):
        image_id_list = []
        caption_list = []
        for (image_id, captions) in captions_for_images.items():
            for caption in captions:
                caption_list.append(caption['caption'])
                image_id_list.append(image_id)

        segmented_caption_list = self.segment_sents(caption_list)
        assert len(image_id_list) == len(caption_list) and len(
            caption_list) == len(segmented_caption_list)

        tokenized_captions_for_images = {}
        for i in range(len(image_id_list)):
            image_id = image_id_list[i]
            if image_id not in tokenized_captions_for_images:
                tokenized_captions_for_images[image_id] = []
            tokenized_captions_for_images[image_id].append(
                segmented_caption_list[i])
        return tokenized_captions_for_images
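# Usage sketch (hypothetical input; captions_for_images maps an image id to a list of
# {'caption': ...} dicts, which is the shape tokenize() expects above):
if __name__ == '__main__':
    tokenizer = StanfordTokenizer()
    captions = {1: [{'caption': u'一个人在切西红柿'}, {'caption': u'这个把手该换了'}]}
    print(tokenizer.tokenize(captions))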
Example #11
def stanford_seg(path, stage='train'):
    """Segment the question1/question2 columns of <path>/<stage>.csv and write the results to <path>/cut/."""
    df_dir = os.path.join(path, '{}.csv'.format(stage))
    data = pd.read_csv(df_dir, error_bad_lines=False, dtype=object)
    data = data.dropna(axis=0, how='any').reset_index(drop=True)
    data_dir = '/home/trueto/stanford_segmenter/'
    seg = StanfordSegmenter(path_to_jar=data_dir + 'stanford-segmenter.jar',
                            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                            path_to_sihan_corpora_dict=data_dir + "data",
                            path_to_model=data_dir + 'data/pku.gz',
                            path_to_dict=data_dir + "data/dict-chris6.ser.gz")
    columns = data.columns
    for column in columns:
        if column in ['question1', 'question2']:
            column_file = os.path.join(path, 'cut',
                                       '{}_{}.txt'.format(stage, column))
            data[column].to_csv(column_file, index=False)
            cut_file = os.path.join(path, 'cut',
                                    '{}_{}_cut.txt'.format(stage, column))
            with open(cut_file, 'w') as f:
                f.write(seg.segment_file(column_file))
Example #13
def get_stanford_segmenter():
    """Lazily build (and cache) a module-level StanfordSegmenter, downloading it first if needed."""
    if not os.path.isdir(STANFORD_SEGMENTER_DIR):
        download_stanford_segmenter()
    global STANFORD_SEGMENTER
    if not STANFORD_SEGMENTER:
        STANFORD_SEGMENTER = StanfordSegmenter(
            path_to_jar=STANFORD_SEGMENTER_JAR,
            path_to_sihan_corpora_dict=STANFORD_SIHAN_CORPORA_DICT,
            path_to_model=STANFORD_MODEL,
            path_to_dict=STANFORD_DICT,
            verbose=True
        )
    return STANFORD_SEGMENTER
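# Usage sketch (assumes the STANFORD_* constants and download_stanford_segmenter() are
# defined elsewhere in this module, as the function above implies):
if __name__ == '__main__':
    seg = get_stanford_segmenter()
    print(seg.segment(u'这是斯坦福中文分词器测试'))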
Example #14
def Segmenter(segmenter_folder_name='',  
		segmenter_jarname = '',
		segmenter_folder='',  
		segmenter_jarpath='', 
		segmenter_corpora='', 
		segmenter_model='', 
		segmenter_dictpath='', 
		segmenter_slfpath=''):
	###
	default_segmenter_folder_name='stanford-segmenter-2017-06-09'
	if len(segmenter_folder_name)==0:
		segmenter_folder_name = default_segmenter_folder_name
	###
	default_segmenter_jarname = 'stanford-segmenter-3.8.0.jar'
	if len(segmenter_jarname) == 0:
		segmenter_jarname = default_segmenter_jarname
	###
	default_segmenter_folder = os.path.join(os.path.expanduser('~'), 'Stanford NLP', segmenter_folder_name)
	if len(segmenter_folder)==0:
		segmenter_folder = default_segmenter_folder
	###
	default_segmenter_jarpath = os.path.join(segmenter_folder, segmenter_jarname)
	if len(segmenter_jarpath) == 0:
		segmenter_jarpath = default_segmenter_jarpath
	###
	default_segmenter_corpora = os.path.join(segmenter_folder, 'data')
	if len(segmenter_corpora) == 0:
		segmenter_corpora = default_segmenter_corpora
	###
	default_segmenter_model = os.path.join(segmenter_folder, 'data', 'pku.gz')
	if len(segmenter_model) == 0:
		segmenter_model = default_segmenter_model
	###
	default_segmenter_dictpath = os.path.join(segmenter_folder, 'data', 'dict-chris6.ser.gz')
	if len(segmenter_dictpath) == 0:
		segmenter_dictpath = default_segmenter_dictpath
	###
	default_segmenter_slfpath = os.path.join(segmenter_folder, 'slf4j-api.jar')
	if len(segmenter_slfpath) == 0:
		segmenter_slfpath = default_segmenter_slfpath
	###
	nltk.internals.config_java("")
	######
	segmenter = StanfordSegmenter(java_class="edu.stanford.nlp.ie.crf.CRFClassifier", path_to_jar=segmenter_jarpath, path_to_sihan_corpora_dict=segmenter_corpora, path_to_model=segmenter_model, path_to_dict=segmenter_dictpath, path_to_slf4j=segmenter_slfpath)
	# segmenter.default_config('zh')
	######
	return segmenter
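# Usage sketch (relies on the defaults above, i.e. an unpacked segmenter under
# ~/Stanford NLP/stanford-segmenter-2017-06-09; illustrative only):
if __name__ == '__main__':
    seg = Segmenter()
    print(seg.segment(u'这是斯坦福中文分词器测试'))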
Example #15
class TMStanfordTokenizer():

    models = {'ZH': 'ctb.gz', 'AR': 'arabic-segmenter-atb+bn+arztrain.ser.gz'}

    dics = {'ZH': 'dict-chris6.ser.gz', 'AR': ''}

    def __init__(self, language):

        self.language = language

        model = self.models.get(language)
        dic = self.dics.get(language)
        if not model:
            raise (Exception(
                "Unsupported language for tokenizer: {}".format(language)))

        # Initialize Stanford Tokenizer
        self.tm_tokenize = StanfordSegmenter(
            path_to_jar=os.path.join(stanford_tokenizer_home,
                                     'stanford-segmenter-3.6.0.jar'),
            path_to_model=os.path.join(stanford_tokenizer_home, 'data', model),
            path_to_dict=os.path.join(stanford_tokenizer_home, 'data', dic),
            path_to_sihan_corpora_dict=os.path.join(stanford_tokenizer_home,
                                                    'data'),
            path_to_slf4j=os.path.join(stanford_tokenizer_home,
                                       'slf4j-api.jar'))

    #Input: String
    #Output: 这 是 斯坦福 中文 分词 器 测试
    def process(self, sentences):
        text = self.tm_tokenize.segment(sentences).strip('\n')
        if re.search(TOK_PATTERN, text):  # Check if the text have tags
            text = XmlUtils.join_tags(text, JOIN_PATTERN)
        return text

    def tokenize_sent(self, text):
        if self.language == 'ZH':
            return [s + '。' for s in text.split('。')
                    if s]  # Split by sentence chinese
        #self.tm_tokenize.segment_sents(text)
        return [text]
Example #16
class SegmentWorker:
    def __init__(self):
        file_path = path.realpath(__file__)
        dir_path = path.dirname(file_path)
        self.path_to_jar = path.join(dir_path, 'stanford-segmenter-3.9.2.jar')
        self.path_to_model = path.join(dir_path, 'data/ctb.gz')  # pku.gz
        self.path_to_dict = path.join(dir_path, 'data/dict-chris6.ser.gz')
        self.path_to_sihan_corpora_dict = path.join(dir_path, 'data/')
        self.seg = StanfordSegmenter(
            path_to_jar=self.path_to_jar,
            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
            path_to_model=self.path_to_model,
            path_to_dict=self.path_to_dict,
            path_to_sihan_corpora_dict=self.path_to_sihan_corpora_dict)

    def seg_file(self, file_to_segment):
        """segment a file and return the result string"""
        seg_result = self.seg.segment_file(file_to_segment)

        translator = str.maketrans('', '', string.digits)
        seg_result = seg_result.translate(translator)
        seg_result = re.sub('[\\\\.!/_,$%^*(+\\"\']+|[+—!,:;。?、~@#¥%…&*()]+',
                            '', seg_result)
        # print(seg_result)
        return seg_result

    def seg_file2list(self, file_to_segment):
        """segment a text file and return array of tokens"""
        seg_result = self.seg_file(file_to_segment)
        # print(seg_result)
        return seg_result.split()

    def seg_file2file(self, origin_file, dest_file):
        """segment a text file and write result tokens to another file"""
        seg_result = self.seg_file(origin_file)
        seg_result = re.sub('\\s+', ' ', seg_result)
        # print(seg_result)
        with open(dest_file, 'w', encoding='UTF-8') as f:
            f.write(seg_result)
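# Usage sketch (hypothetical file names; the segmenter jar and data/ directory must sit next
# to this script, as __init__ above assumes):
if __name__ == '__main__':
    worker = SegmentWorker()
    print(worker.seg_file2list('corpus.txt')[:20])
    worker.seg_file2file('corpus.txt', 'corpus_segmented.txt')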
Example #17
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.stanford import CoreNLPTokenizer

path = "D:/www/data/nlpsoftware/stanford-segmenter"
segmenter = StanfordSegmenter(
    path_to_jar=path + "/stanford-segmenter.jar",
    path_to_sihan_corpora_dict=path + "/data",
    path_to_model=path + "/data/pku.gz",
    path_to_dict=path + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier')
#
sentence = u"这是斯坦福中文分词器测试"
sentence = u"工信处女干事每月经过   下属   科室都要亲口交代24口交换机等技术性器件的安装工作"

segmenter.tokenize_sents(u"工信处")
result = segmenter.segment(sentence)
result2 = segmenter.segment_file(
    "D:/www/data/nlpdata/icwb2-data/testing/pku_test.utf8")
clean_content = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content.txt"
# clean_content_out="D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content_out.txt"
# result3 = segmenter.segment_file(clean_content)
print(type(result2))

# with open(clean_content_out,'wb+') as f:
#     f.writelines([(s+"\r\n").encode('utf8') for s in  clean_content_out])
print(result2)
# outfile = open("D:/www/data/nlpsoftware/outfile.txt",'w')
# outfile.write(result)
# outfile.close()
#
Example #18
def segment(sentence):
    """Segment a Chinese sentence and return the space-joined result string."""
    path = '/media/razor/Files/Python27/nltk_data/stanford-segmenter-2015-12-09/'
    segmenter = StanfordSegmenter(
        path_to_jar=path + 'stanford-segmenter-3.6.0.jar',
        path_to_slf4j=path + 'slf4j-api.jar',
        path_to_model=path + 'data/pku.gz',
        path_to_dict=path + 'data/dict-chris6.ser.gz')

    return segmenter.segment(sentence)  # e.g. segment(u'我爱北京天安门')

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

seg = StanfordSegmenter()
seg.default_config('zh')
sent = u'这是斯坦福中文分词器测试'
print(seg.segment(sent))
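# Note: with no explicit paths, default_config('zh') has to locate the segmenter install on
# its own; an earlier snippet on this page does that by exporting STANFORD_SEGMENTER before
# constructing the segmenter (the path below is an illustrative placeholder):
# import os
# os.environ["STANFORD_SEGMENTER"] = '/path/to/stanford-segmenter-2018-10-16'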
class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger, 
    Stanford Named Entity Recognizer and Stanford Parser 
    """
    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path +
            'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)

        if DEBUG:
            print(t1, t2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]

        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')

        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))

            if DEBUG:
                print(split1[i], split2[i])

        return rlist

    def split_sent_jieba(self, textPair):

        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)

        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should already be split into words
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])

        if DEBUG:
            print(t1_tag, t2_tag)

        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_tag, t2_tag,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_tag, t2_tag)

        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should already be split into words
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])

        if DEBUG:
            print(t1_ner, t2_ner)

        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_ner, t2_ner,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_ner, t2_ner)

        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should already be split into words
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
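# Usage sketch (hypothetical; assumes ../Models/stanfordNLP/ holds the jars and models listed
# in __init__, and that text_pair.TextPair(t1, t2, label) is importable):
if __name__ == '__main__':
    nlp = NLPCore()
    pair = text_pair.TextPair(u'这是斯坦福中文分词器测试', u'我喜欢学习编程', 0)
    pair = nlp.split_sent_stanford(pair)
    pair = nlp.pos_tag(pair)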
Example #21
# find_entity_t = test.find_entity()
# find_VP_t = test.firstVP()
# test.drawTree()
test.show(firstNP_t)
# test.show(find_entity_t)
# test.show(find_VP_t)
# # test.show(find_entity_t)
# test.show(firstMinNP_t)
result = test.find_realtionship(firstNP_t)
print(result)
test.drawTree()
#
#
# print(test.rel)
# test.show(test.find_realtionship())

# comparison experiment
chi_parser = StanfordParser(path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
                            path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
                            model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
data_dir='../stanford-segmenter-2018-02-27/'
segmenter = StanfordSegmenter(path_to_jar=data_dir+"stanford-segmenter-3.9.1.jar",
                              path_to_sihan_corpora_dict=data_dir+"/data", path_to_model=data_dir+"/data/pku.gz",
                              path_to_dict=data_dir+"/data/dict-chris6.ser.gz",
                              java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                              )
result=segmenter.segment(test_str)
result_ls = result.split()
ch_tree = list(chi_parser.parse(result_ls))[0]
ch_tree.draw()
# print(result)
# encoding: utf-8
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordNERTagger

segmenter = StanfordSegmenter(
    # jar the segmenter depends on
    path_to_jar=
    r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # segmentation data directory
    path_to_sihan_corpora_dict=
    r"/home/jiangix/document/stanford-segmenter/data",
    # model based on the PKU People's Daily corpus from the 2005 bakeoff
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=
    r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz")

segmenter.default_config('zh')
result = segmenter.segment(u'我喜欢学习编程')

chi_tagger = StanfordNERTagger(
    model_filename=
    r"/home/jiangix/document/stanford-chinese-corenlp-models/chinese.misc.distsim.crf.ser.gz",
    path_to_jar=r"/home/jiangix/document/stanford-ner/stanford-ner.jar")
for word, tag in chi_tagger.tag(result.split()):
    print(word, tag)
Example #23
#coding:utf-8


from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# Chinese word segmentation
segmenter = StanfordSegmenter(
    path_to_jar="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/stanford-segmenter-3.5.2.jar",
    path_to_slf4j="/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/slf4j-api.jar",
    path_to_sihan_corpora_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data",
    path_to_model="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/pku.gz",
    path_to_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/dict-chris6.ser.gz"

)
str="我在我在博客园开了一个博客。"
print (segmenter.segment(str))

# English tokenization


from nltk.tokenize import StanfordTokenizer
tokenizer=StanfordTokenizer(path_to_jar=r"/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/stanford-parser.jar")
sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
print (tokenizer.tokenize(sent))

# Chinese named entity recognition
from nltk.tag import StanfordNERTagger
chi_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/chinese.misc.distsim.crf.ser.gz'
                             ,path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar')
print (chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。\r\n'.split()))
Example #24
# from nltk.tokenize.stanford_segmenter import StanfordSegmenter
# segmenter = StanfordSegmenter(path_to_jar="stanford-segmenter-3.4.1.jar", path_to_sihan_corpora_dict="./data", path_to_model="./data/pku.gz", path_to_dict="./data/dict-chris6.ser.gz")
# sentence = u"这是斯坦福中文分词器测试"
# segmenter.segment(sentence)
#  # u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
# segmenter.segment_file("test.simp.utf8")
# # u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...
## download NLTK data if needed
# import nltk
# nltk.download()

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
segmenter = StanfordSegmenter(
     path_to_jar="stanford-segmenter-3.6.0.jar",
     path_to_slf4j = "slf4j-api.jar",
     path_to_sihan_corpora_dict="./data",
     path_to_model="./data/pku.gz",
     path_to_dict="./data/dict-chris6.ser.gz")
sentence = u"这是斯坦福中文分词器测试"
segmenter.segment(sentence)
# >>> u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
segmenter.segment_file("test.simp.utf8")
# >>> u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...

# English test
# import nltk
# text = 'i am a good boy.you are a bad girl'
# sens = nltk.sent_tokenize(text)
# print(sens)
# words = []
# for sent in sens:
Example #25
#coding:UTF-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar=
    r"D:\StanfordNLP\stanford-segmenter\stanford-segmenter-3.6.0.jar",
    path_to_slf4j=r"D:\StanfordNLP\stanford-segmenter\slf4j-api.jar",
    path_to_sihan_corpora_dict=r"D:\StanfordNLP\stanford-segmenter\data",
    path_to_model=r"D:\StanfordNLP\stanford-segmenter\data\pku.gz",
    path_to_dict=r"D:\StanfordNLP\stanford-segmenter\data\dict-chris6.ser.gz")
str = u"我在博客园开了一个博客,我的博客名叫伏草惟存,写了一些自然语言处理的文章。"
result = segmenter.segment(str)
print result
Example #26
                             'helpers', "englishPCFG.ser.gz")
model_de_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "germanPCFG.ser.gz")
jar_model_de_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'helpers/stanford-corenlp-full-2016-10-31')
model_cn_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "chinesePCFG.ser.gz")
model_ch_lex_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'helpers', "chineseFactored.ser.gz")

segmenter = StanfordSegmenter(
    path_to_jar=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "stanford-segmenter.jar"),
    path_to_sihan_corpora_dict=os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'helpers',
        "stanford-segmenter-2015-12-09/data"),
    path_to_model=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'helpers',
                               "stanford-segmenter-2015-12-09/data/pku.gz"),
    path_to_dict=os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'helpers',
        "stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz"))

#parser_en = StanfordParser(model_path = model_en_path)
#parser_de = StanfordParser(model_path = model_de_path)
#parser_cn = StanfordParser(model_path = model_cn_path)
#parser_ch_lex = StanfordParser(model_path = model_ch_lex_path)

dep_parser_en = StanfordDependencyParser(model_path=model_en_path)
dep_parser_de = StanfordDependencyParser(model_path=model_de_path)
dep_parser_cn = StanfordDependencyParser(model_path=model_cn_path)
Example #27
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

seg = StanfordSegmenter(
    path_to_jar=r'D:\temp\stanford\stanford-segmenter.jar',
    path_to_slf4j=r'D:\temp\stanford\slf4j-api.jar',
    path_to_sihan_corpora_dict=r'D:\temp\stanford\data',
    path_to_model=r'D:\temp\stanford\data\ctb.gz',
    path_to_dict=r'D:\temp\stanford\data\dict-chris6.ser.gz',
)

# sentence = "这是斯坦福中文分词器测试"
sentence = "操你大爷,狗日的"
res = seg.segment(sentence)
print res

# import jieba
# ss = jieba.cut(sentence)
# print ' '.join(list(ss)), type(ss)

Example #28
zh = open(zh_dir,'r')

len_p = number_file(en_dir)

corpus = []
for x in range(0, len_p):
	corpus.append([en.readline().strip(),zh.readline().strip()])

###############################

###tokenize chinese####
print '########Starting tokenization###########\n'
##
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
pre_path = '/home/db32555/MM/stanford-segmenter/'
segmenter = StanfordSegmenter(path_to_jar=pre_path+'stanford-segmenter-3.4.1.jar', path_to_sihan_corpora_dict=pre_path+'./data', path_to_model=pre_path+'./data/pku.gz', path_to_dict=pre_path+'./data/dict-chris6.ser.gz')
##setup_end##
from nltk import word_tokenize
##setup_end_eng##
for index, node in enumerate(corpus):
	chinese = node[1]
	chinese = unicode(chinese, 'utf-8')
	tmp_segmented = segmenter.segment(chinese)
	tmp_segmented = tmp_segmented.split(" ")
	#
	del corpus[index][1]
	corpus[index].append(tmp_segmented)	
	print tmp_segmented
	##this is chinese 
	english = node[0]
Example #29
import random, sys

# import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

jar = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/stanford-segmenter-3.8.0.jar'
api_jar = '/Users/sinansmac/Public/StanfordNLP/stanford-parser-full-2017-06-09/slf4j-api.jar'
# model = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/data/dict-chris6.ser.gz'

seg = StanfordSegmenter(path_to_jar=jar, path_to_slf4j=api_jar)
seg.default_config('zh')
# sent = u'这是斯坦福中文分词器测试'
# print(seg.segment(sent))


class MarkovZh:
    def __init__(self):
        self.suffix_map = {}
        self.prefix = ()

    def process_file(self, filename, order=1):
        fp = open(filename)
        # self.skip_gutenberg_header(fp)

        for line in fp:
            for word in line.rstrip().split():
                self.process_word(word, order)

    # def skip_gutenberg_header(self, fp):
    #     for line in fp:
    #         if line.startswith('*END*THE SMALL PRINT!'):
Example #30
#coding:utf8
import os
import sys
import logging

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger

java_path = "C:\\Program Files\\Java\\jdk1.8.0_73\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

segmenter = StanfordSegmenter(
    path_to_jar=
    "E:\\lib\\stanford-segmenter-2017-06-09\\stanford-segmenter-3.8.0.jar",
    path_to_slf4j="E:\\lib\\stanford-segmenter-2017-06-09\\slf4j-api.jar",
    path_to_sihan_corpora_dict="E:\\lib\\stanford-segmenter-2017-06-09\\data",
    path_to_model="E:\\lib\\stanford-segmenter-2017-06-09\\data\\pku.gz",
    path_to_dict=
    "E:\\lib\\stanford-segmenter-2017-06-09\\data\\dict-chris6.ser.gz")

postagger = StanfordPOSTagger(
    path_to_jar=
    "E:\\lib\\stanford-postagger-full-2017-06-09\\stanford-postagger.jar",
    model_filename=
    'E:\\lib\\stanford-postagger-full-2017-06-09\\models\\chinese-distsim.tagger',
)


def pos_to_sequence(sent, segmenter=segmenter, postagger=postagger):
    seg_sent = segmenter.segment(sent)
Example #31
        print "{}: {}".format(key, value)


if __name__ == '__main__':

    examples = []

    with open(pjoin(args.data_dir, args.data_file), 'rb') as f:
        for line in f:
            examples.append(line)

    if args.corpus == "gigaword_ch" and not args.char:
        print "segmenting each example for Chinese, could take a while"
        from nltk.tokenize.stanford_segmenter import StanfordSegmenter

        seg = StanfordSegmenter(path_to_slf4j=path_to_slf4j,
                                path_to_jar=path_to_jar)
        seg.default_config('zh')

    # ==== Filtering =====
    data_dist = {}
    filtered_examples = {}
    number_of_filtered_examples = 0
    for i, ex in enumerate(examples):
        s1, s2, label = ex[:-1].split('\t')

        if args.corpus == 'gigaword_ch':
            s1 = s1.replace(' .', '。')  # parser appended normal period
            s2 = s2.replace(' .', '。')

        if args.char and args.corpus == "gigaword_ch":
            # we presplit into chars
import os, time

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
start = time.time()

jar = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/stanford-segmenter.jar'
api_jar = '/Users/sinansmac/Public/StanfordNLP/stanford-parser-full-2017-06-09/slf4j-api.jar'
# dict = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/data/dict-chris6.ser.gz'

seg = StanfordSegmenter(path_to_jar=jar, path_to_slf4j=api_jar)
seg.default_config('zh')

# sent = u'这是斯坦福中文分词器测试'
# print(seg.segment(sent))

fp = "chinese.txt"
tokenstr = seg.segment_file(fp)
token_ls = list(tokenstr)
print(len(token_ls), '\n', tokenstr, '\n', token_ls)

# with open('chinese_tokens.txt', 'a') as writef:
#     for line in token_ls:
#         writef.write(line.rstrip().split())

# print(tokens, '\n', type(tokens)) # class 'str'

end = time.time()

print("process time:", round(end - start))
		elif opt in ("-d", "--idir"):
			inputdir = arg
		elif opt in ("-t"):
			data_type = arg
		elif opt in ("-o", "--ofile"):
			outputfile = arg

	if inputdir == '':
		print 'test.py -t <datatype> -d <inputdir> -o <outputfile>'
		sys.exit(2)

	if outputfile == '':
		outputfile = 'vocab.out'
	#########################

	segmenter = StanfordSegmenter(path_to_jar="../stanford-segmenter-2015-12-09/stanford-segmenter-3.6.0.jar", path_to_slf4j = "../stanford-segmenter-2015-12-09/slf4j-api.jar", path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data", path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz", path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")
	
	vocabSet = set([])
	build_time = 0.
	total = count_em(inputdir)
	for dirPath, dirNames, fileNames in os.walk(inputdir):
		if len(fileNames) > 0 :
			sumContain = ''
			for f in fileNames:
				try:
					if data_type == 'CIRB010':
						root = ET.parse(dirPath+'/'+f).getroot()
						date = root[0][1].text.strip()
						title = root[0][2].text.strip()
						text = ''
						for p in root[0][3]: