# Imports needed by the snippets below; stanford_corenlp_path is a module-level
# variable that must point at the directory holding the Stanford segmenter
# distribution and the slf4j jar.
import os
import re

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def segment_sentences(sentence_list):
    segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=os.path.join(stanford_corenlp_path,
                                 'stanford-segmenter-2018-02-27',
                                 'stanford-segmenter-3.9.1.jar'),
        path_to_slf4j=os.path.join(stanford_corenlp_path,
                                   'slf4j-api-1.7.25.jar'),
        path_to_sihan_corpora_dict=os.path.join(
            stanford_corenlp_path, 'stanford-segmenter-2018-02-27', 'data'),
        path_to_model=os.path.join(stanford_corenlp_path,
                                   'stanford-segmenter-2018-02-27', 'data',
                                   'pku.gz'),
        path_to_dict=os.path.join(stanford_corenlp_path,
                                  'stanford-segmenter-2018-02-27', 'data',
                                  'dict-chris6.ser.gz'),
        sihan_post_processing='true')
    result = segmenter.segment_sents(sentence_list)
    result = result.strip()
    segmented_list = re.split(os.linesep, result)
    if len(segmented_list[-1]) == 0:
        segmented_list = segmented_list[:-1]
    if len(segmented_list) != len(sentence_list):
        for i in range(len(segmented_list)):
            ss = ''.join(segmented_list[i].split())
            if ss != sentence_list[i]:
                print(i, '|', segmented_list[i], '|', sentence_list[i])
                # break
        print(len(segmented_list), len(sentence_list))
    assert len(segmented_list) == len(sentence_list)
    return segmented_list
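

# A minimal usage sketch for segment_sentences (not part of the original
# snippet); the path below is a hypothetical placeholder for the Stanford
# segmenter install directory.
stanford_corenlp_path = '/path/to/stanford_core_nlp'
for line in segment_sentences(["一个人在切西红柿", "这个把手该换了"]):
    print(line)
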
class StanfordTokenizer:
    """
    class for segmenting Chinese sentences
    uses stanford segmenter 3.9.1
    """
    def __init__(self):
        stanford_corenlp_path = r'/media/mcislab/sdb1/home/mcislab/zwt/stanford_core_nlp'
        self.segmenter = StanfordSegmenter(
            java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
            path_to_jar=os.path.join(stanford_corenlp_path,
                                     'stanford-segmenter-2018-02-27',
                                     'stanford-segmenter-3.9.1.jar'),
            path_to_slf4j=os.path.join(stanford_corenlp_path,
                                       'slf4j-api-1.7.25.jar'),
            path_to_sihan_corpora_dict=os.path.join(
                stanford_corenlp_path, 'stanford-segmenter-2018-02-27',
                'data'),
            path_to_model=os.path.join(stanford_corenlp_path,
                                       'stanford-segmenter-2018-02-27', 'data',
                                       'pku.gz'),
            path_to_dict=os.path.join(stanford_corenlp_path,
                                      'stanford-segmenter-2018-02-27', 'data',
                                      'dict-chris6.ser.gz'),
            sihan_post_processing='true')

    def segment_sents(self, sentences):
        result = self.segmenter.segment_sents(sentences)
        result = result.strip()
        segmented_list = re.split(os.linesep, result)
        if len(segmented_list[-1]) == 0:
            segmented_list = segmented_list[:-1]
        print(len(segmented_list), len(sentences))
        assert len(segmented_list) == len(sentences)
        return segmented_list

    def tokenize(self, captions_for_images):
        image_id_list = []
        caption_list = []
        for (image_id, captions) in captions_for_images.items():
            for caption in captions:
                caption_list.append(caption['caption'])
                image_id_list.append(image_id)

        segmented_caption_list = self.segment_sents(caption_list)
        assert len(image_id_list) == len(caption_list) and len(
            caption_list) == len(segmented_caption_list)

        tokenized_captions_for_images = {}
        for i in range(len(image_id_list)):
            image_id = image_id_list[i]
            if image_id not in tokenized_captions_for_images:
                tokenized_captions_for_images[image_id] = []
            tokenized_captions_for_images[image_id].append(
                segmented_caption_list[i])
        return tokenized_captions_for_images
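
# A minimal usage sketch for StanfordTokenizer (assumption, not original code):
# the input format, a dict mapping image_id to a list of {'caption': ...} dicts,
# is inferred from the tokenize() loop above; the ids and captions are
# placeholders.
tokenizer = StanfordTokenizer()
captions_for_images = {
    'img_001': [{'caption': '一个人在切西红柿'}],
    'img_002': [{'caption': '这个把手该换了'}],
}
print(tokenizer.tokenize(captions_for_images))
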
def segment():
    """
    split a Chinese sentence into words
    :return:
    """
    segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\stanford-segmenter-3.9.1.jar",
        path_to_slf4j=r"D:\Desktop\stanford corenlp\slf4j-api-1.7.25.jar",
        path_to_sihan_corpora_dict=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data",
        path_to_model=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data\pku.gz",
        path_to_dict=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data\dict-chris6.ser.gz",
        sihan_post_processing='true'
    )   # paths to the jar and model files should be changed to match your installation

    # result = segmenter.segment(s)
    result = segmenter.segment_sents(["一个人在切西红柿", "这个把手该换了", "别把手放在我的肩膀上", "他正在量和服尺寸"])

    print(result)
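
    # segment_sents() returns one newline-joined string of segmented sentences,
    # so callers usually split it back into a per-sentence list; a sketch
    # mirroring the other snippets in this file:
    segmented = [line for line in result.strip().split('\n') if line]
    print(segmented)
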
                examples += filtered_examples[label]

    print "total number in produced dataset: {}".format(len(examples))

    # now we word segment for Chinese
    if args.corpus == "gigaword_ch" and not args.char:
        s1_list, s2_list, labels = [], [], []
        for ex in examples:
            s1, s2, label = ex.split('\t')

            s1_list.append(s1)  # Python 3 strings are already unicode; no decode needed
            s2_list.append(s2)
            labels.append(label)

        logging.info("s1, s2 collected, segmentation begins")
        s1_list = seg.segment_sents(s1_list)
        s1_list = s1_list.split('\n')[:-1]
        logging.info("s1 segmented")

        s2_list = seg.segment_sents(s2_list)
        s2_list = s2_list.split('\n')[:-1]
        logging.info("s2 segmented")

        examples = []
        assert len(s1_list) == len(s2_list) == len(labels)
        for i in range(len(s1_list)):
            example_line = "\t".join([s1_list[i], s2_list[i],
                                      labels[i]])  # label has '\n'
            examples.append(
                example_line
            )  # no need to encode in utf-8 anymore, seg produces utf-8
# Assumed imports for this snippet: the Stanford wrappers come from NLTK and
# text_pair is a project-local module providing the TextPair container.
import jieba
from nltk.tag.stanford import StanfordPOSTagger, StanfordNERTagger
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

import text_pair

DEBUG = False  # assumed module-level debug flag referenced by the methods below


class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger, 
    Stanford Named Entity Recognizer and Stanford Parser 
    """
    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'

        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")

        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")

        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')

        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path +
            'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)

        if DEBUG:
            print(t1, t2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]

        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')

        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))

            if DEBUG:
                print(split1[i], split2[i])

        return rlist

    def split_sent_jieba(self, textPair):

        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)

        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)

        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should be splitted
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])

        if DEBUG:
            print(t1_tag, t2_tag)

        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_tag, t2_tag,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_tag, t2_tag)

        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should be splitted
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()

        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])

        if DEBUG:
            print(t1_ner, t2_ner)

        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]

        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)

        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])

            rlist.append(text_pair.TextPair(t1_ner, t2_ner,
                                            textPairs[i].label))

            if DEBUG:
                print(t1_ner, t2_ner)

        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should be splitted
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
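
# A minimal usage sketch for NLPCore (assumptions: the model files exist under
# ../Models/stanfordNLP/ and text_pair.TextPair is a simple container taking
# (t1, t2, label); the sentences and the label value are placeholders).
nlp = NLPCore()
pair = text_pair.TextPair('他正在量和服尺寸', '别把手放在我的肩膀上', 0)
segmented = nlp.split_sent_jieba(pair)   # word-segment both sentences
tagged = nlp.pos_tag(segmented)          # then POS-tag the segmented text
print(tagged.t1, tagged.t2)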