Example 1
class SynsetExample(object):
    def __init__(self, use_extended_gloss=False):
        self.synset_example = {}
        self.tokenizer = CoreNLPParser(url='http://localhost:42636')

        self.use_babelnet = use_extended_gloss
        if self.use_babelnet:
            from py4j.java_gateway import JavaGateway
            gateway = JavaGateway()
            self.sense = gateway.entry_point

    def __getitem__(self, name):
        if name not in self.synset_example:
            self.synset_example[name] = self.get_synset_example(name)

        return self.synset_example[name]

    def get_synset_example(self, name):
        synset = wn.synset(name)
        if self.use_babelnet:
            synset_pos = synset.pos()
            if synset_pos == "s":
                synset_pos = "a"
            synset_id = 'wn:{}{}'.format(
                str(synset.offset()).zfill(8), synset_pos)
            example = self.sense.getExampleByWnSynsetId(synset_id)
            if not example:
                example = " ".join(synset.examples()).strip()
        else:
            example = " ".join(synset.examples()).strip()
        return [x.lower() for x in self.tokenizer.tokenize(example)]
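A minimal usage sketch for the class above, assuming WordNet is available as wn (from nltk.corpus), as the class body expects, and a CoreNLP tokenizer server is listening on the configured URL; the synset name is only illustrative:

from nltk.corpus import wordnet as wn  # module-level import assumed by get_synset_example

examples = SynsetExample(use_extended_gloss=False)
print(examples['dog.n.01'])  # lowercased tokens of the synset's example sentence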
Example 2
def create_dataset_bin(annotation_file, data_file):
    parser = CoreNLPParser(url='http://localhost:9080')
    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"

    dataset = []

    with open(annotation_file, "r") as file1, open(data_file, "r") as file2:
        for line_from_file_1, line_from_file_2 in zip(file1, file2):
            output = None
            line1 = line_from_file_1.split()
            line2 = line_from_file_2
            if line1[0] == "ne":
                output = 7
            elif line1[0] == "hp":
                output = 0
            elif line1[0] == "sd":
                output = 1
            elif line1[0] == "ag":
                output = 2
            elif line1[0] == "dg":
                output = 3
            elif line1[0] == "sp":
                output = 4
            elif line1[0] == "fr":
                output = 5
            elif line1[0] == "me":
                output = 6
            dataset.append((output, list(parser.tokenize(line2))))
    print(len(dataset))

    with open(dirname + "Pickle/dataset_ready", 'wb') as outfile:
        cPickle.dump(dataset, outfile)
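The elif chain above is a fixed mapping from the two-letter emotion code to a class index; the same lookup can be written as a dictionary. A sketch (label_map is an illustrative name, not part of the original code):

label_map = {"hp": 0, "sd": 1, "ag": 2, "dg": 3, "sp": 4, "fr": 5, "me": 6, "ne": 7}
output = label_map.get(line1[0])  # None for unknown codes, matching the original behaviour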
Example 3
def start_testing(trained_model_file):
    parser = CoreNLPParser(url='http://localhost:9080')

    emotions = ['happiness', 'sadness', 'anger', 'disgust', 'surprise', 'fear']

    dirname = os.path.dirname(os.path.realpath(__file__)) + "/"

    glove_model = read_glove_vectors(dirname + "Pickle/gloveModel")

    hidden_size = 256
    num_layers = 2
    bidirectional = False
    batchnorm = False
    dropout_hidden = 0.3
    dropout_output = 0.9
    model = LSTM(300, hidden_size, num_layers, bidirectional, batchnorm,
                 dropout_hidden, dropout_output).to(device)

    with torch.no_grad():
        model.load_state_dict(torch.load(trained_model_file))
        print(model)
        model.eval()
        while True:
            test_sentence = input("Give a test sentence: ")
            sentence = list(parser.tokenize(test_sentence))
            input1, sent_length = get_input_vector(glove_model, sentence)
            class_pred = model(input1, sent_length)
            print("Sentence: " + test_sentence)
            _, pred = class_pred.max(dim=1)
            print("Prediction:\t" + emotions[pred[0]])
            print("Output Values:")
            percentages = torch.nn.functional.softmax(class_pred, dim=1) * 100
            for i in range(len(emotions)):
                print(emotions[i] + " %" +
                      str(percentages.data.tolist()[0][i]))
Example 4
def build_vocab(json: str, threshold: int, keeppunctuation: bool, host_address: str, character_level: bool = False, zh: bool = True):
    """Build vocabulary from a json file with a given threshold to drop all counts < threshold

    Args:
        json (string): Input json file. Should have a column named 'caption'
        threshold (int): Threshold to drop all words with counts < threshold
        keeppunctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary
    """
    df = pd.read_json(json)
    counter = Counter()
    
    if zh:
        from zhon.hanzi import punctuation
        parser = CoreNLPParser(host_address)
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuations
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation),"",caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = list(parser.tokenize(caption))
            counter.update(tokens)
    else:
        punctuation = ',.()'
        for i in tqdm(range(len(df)), leave=False):
            caption = str(df.loc[i]['caption'])
            # Remove all punctuations
            if not keeppunctuation:
                caption = re.sub("[{}]".format(punctuation),"",caption)
            if character_level:
                tokens = list(caption)
            else:
                tokens = caption.split()
            counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
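A usage sketch for the function above; the json path and server address are placeholders, and the Vocabulary class used in the body is assumed to be importable from the surrounding project:

# Hypothetical call; 'captions.json' and the server address are placeholders.
vocab = build_vocab('captions.json', threshold=1, keeppunctuation=False,
                    host_address='http://localhost:9000', zh=True)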
Example 5
class NLTK_NLP():

    def __init__(self, ip_port):
        self.dep_parser = CoreNLPDependencyParser(url=ip_port)
        self.ner_parser = CoreNLPParser(url=ip_port, tagtype='ner')
        self.parser = CoreNLPParser(url=ip_port)
        self.pos_tagger = CoreNLPParser(url=ip_port, tagtype='pos')

    def generate_dependency_tree(self, sentence):
        '''what is the name of the asteroid ?'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return dependency_tree

    def generate_dependency_graph(self, sentence):
        '''12 {'address': 12, 'word': '.', 'lemma': '.', 'ctag': '.', 'tag': '.', 'feats': '', 'head': 1, 'deps': defaultdict(<class 'list'>, {}), 'rel': 'punct'}
        7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``.'''
        dependency_tree, = self.dep_parser.raw_parse(sentence=sentence)
        return DependencyGraph(dependency_tree.to_conll(10))

    def generate_constituency_tree(self, sentence):
        '''input: one question'''
        tree_list = list(self.parser.raw_parse(sentence=sentence))
        return tree_list[0]

    def get_pos(self, sentence):
        '''What is the airspeed of an unladen swallow ?
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
        '''
        pos_list = list(self.pos_tagger.tag(sentence.split()))
        # tokens = nltk.word_tokenize(sentence)
        # wordpos = nltk.pos_tag(tokens)
        return pos_list

    def get_pos_by_tokens(self, tokens):
        '''What is the airspeed of an unladen swallow ?'''
        pos_list = list(self.pos_tagger.tag(tokens))
        return pos_list

    def get_ner(self, sentence):
        # tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
        '''april the 26th, 1882 is the birth date of which athletes ?
        [('april', 'DATE'), ('the', 'DATE'), ('26th', 'DATE'), (',', 'DATE'), ('1882', 'DATE'),
        ('is', 'O'), ('the', 'O'), ('birth', 'O'), ('date', 'O'), ('of', 'O'), ('which', 'O'),
        ('athletes', 'O'), ('?', 'O')]'''
        sequence_ner_tuple_list = self.ner_parser.tag(sentence.split())
        sequence_ner_list = []
        for i, (word, ner_tag) in enumerate(sequence_ner_tuple_list):
            sequence_ner_list.append(ner_tag)
        return sequence_ner_list

    def get_toknizer(self, sentence):
        return list(self.parser.tokenize(sentence))

    def find_phrases(self, tree, phrase_tag='NP'):
        return [subtree.leaves() for subtree in tree.subtrees(lambda t: t.label()==phrase_tag)]
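A brief usage sketch of the wrapper above, assuming a CoreNLP server with the pos and ner annotators is reachable at the given address (the address is a placeholder):

nlp = NLTK_NLP('http://localhost:9000')
print(nlp.get_toknizer('What is the airspeed of an unladen swallow ?'))
print(nlp.get_pos('What is the airspeed of an unladen swallow ?'))
tree = nlp.generate_constituency_tree('What is the airspeed of an unladen swallow ?')
print(nlp.find_phrases(tree, phrase_tag='NP'))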
Example 6
class Lex_parser:
    def __init__(self, tag_id_initialized=False, tag_id=None, uncased=True):
        self.uncased = uncased
        self.tag_id_initialized = tag_id_initialized
        if tag_id_initialized:
            self.tag_to_id = tag_id
        else:
            self.tag_to_id = {"CLSSEP": 0, "UNKNOWN": 1}
        self.parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
        self.basic_tokenizer = BasicTokenizer()

    def tokenize(self, sentence):
        return list(self.parser.tokenize(sentence))

    def convert_sentence_to_tags(self, sentence: Union[str, list]):
        if type(sentence) == str:
            if self.uncased:
                sentence = sentence.lower()

        else:
            sentence = " ".join(sentence)
            if self.uncased:
                sentence = sentence.lower()

        sentence = self.basic_tokenizer.tokenize(sentence)

        # print("sentence here,", sentence)
        sentence = list(map(lambda x: x.upper() if x == 'i' else x, sentence))
        tags = self.parser.tag(sentence)
        # print("sentence here,", sentence)
        # print("tags here", tags)
        # exit(-2)
        if not self.tag_id_initialized:
            for tag in tags:
                if tag[1] not in self.tag_to_id:
                    self.tag_to_id[tag[1]] = len(self.tag_to_id)
        return tags

    def convert_tags_to_ids(self, tags):
        res = list(map(lambda x: self.tag_to_id[x[1]], tags))
        # print("to ids ==")
        # print(len(tags), tags)
        # print(len(res), res)
        return res

    def convert_sentence_to_ids(self, sentence: Union[str, list]):
        if not self.parser:
            self.parser = CoreNLPParser(url='http://localhost:9000',
                                        tagtype='pos')

        tags = self.convert_sentence_to_tags(sentence)
        ids = self.convert_tags_to_ids(tags)
        print(type(sentence), len(sentence), len(tags), len(ids))
        return list(ids)
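A short usage sketch, assuming BasicTokenizer comes from the project's BERT-style tokenization module and a CoreNLP server with the pos annotator runs on localhost:9000; the sentence is arbitrary:

lex = Lex_parser()
tags = lex.convert_sentence_to_tags('i like natural language processing')
print(tags)
print(lex.convert_tags_to_ids(tags))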
Example 7
def tokenize_and_write_to_tokenresult(text, dest):
    #https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
    url = 'http://localhost:9000'
    dep_parser = CoreNLPDependencyParser(url=url)
    tokens = CoreNLPParser(url)

    res = tokens.tokenize(text)

    for token in res:
        if token == '.':
            dest.write(token.lower() + '\n')
        else:
            dest.write(token.lower() + ' ')
Example 8
def process(input_labels_csv: str,
            output_file: str,
            hostname="http://localhost:9000",
            character_level: bool = False):
    captions = pd.read_csv(input_labels_csv, sep='\t', encoding='utf-8')
    parser = CoreNLPParser(hostname)
    captions = captions[captions.caption.notnull()]
    captions['tokens'] = None
    for idx, row in tqdm(captions.iterrows(), total=len(captions)):
        caption = row['caption']
        # Remove punctuation
        caption = re.sub("[{}]".format(punctuation), "", caption)
        if character_level:
            captions.at[idx, 'tokens'] = list(caption)
        else:
            captions.at[idx, 'tokens'] = list(parser.tokenize(caption))
    captions.to_json(output_file)
Example 9
def tokenize(text, url='http://localhost:9000'):
    """CoreNLP 分词

    Parameters
    ----------
    text : str
        要分词的文本
    url : str, optional
        CoreNLP Web 服务器的 URL (default: 'http://localhost:9000')
    parser : 用于编成接口,不要在命令行使用

    Returns
    -------
    str
        返回空格分隔 token 的句子
    """

    parser = CoreNLPParser(url)
    tokens = list(parser.tokenize(text))
    return ' '.join(tokens)
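Usage is a single call; this assumes a CoreNLP server on the default port:

print(tokenize('CoreNLP returns the tokens joined by single spaces.'))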
Example 10
def tokenize_tweet(t):
    """
    Use Stanford's PTBTokenizer to tokenize the tweet.
    Requires Stanford CoreNLP Server to be running.
    For setup information see: [
        https://stanfordnlp.github.io/CoreNLP/index.html,
        https://www.khalidalnajjar.com/setup-use-stanford-corenlp-server-python/
    ]
    Here we use the wrapper from `nltk` package instead of stanfordcorenlp

    From the directory where you set up Stanford CoreNLP, run the server:
    java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000

    Args:
        t: The tweet to be tokenized

    Returns:
        t (generator): A generator for the list of tokens generated from the tweet.
    """
    parser = CoreNLPParser(url='http://localhost:9000/')

    return parser.tokenize(t)
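The function returns a generator, so a caller typically materializes it; a sketch with a placeholder tweet:

tweet_tokens = list(tokenize_tweet('Just finished the marathon, feeling great!'))
print(tweet_tokens)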
Example 11
class StanfordNLTKWrapper:
    def __init__(self, config_file_path='aida_event/config/xmie.json'):
        self._config = read_dict_from_json_file(config_file_path)
        self._domain_name = self._config['common_tools']['stanford_url']
        self._port_number = self._config['common_tools']['stanford_port']
        self._pos_model = self._config['common_tools']['stanford_pos_model']
        self._pos_jar = self._config['common_tools']['stanford_pos_jar']
        self._parser_model = self._config['common_tools'][
            'stanford_parser_model']
        self._parser_jar = self._config['common_tools']['stanford_parser_jar']

        self._core_nlp_parser = CoreNLPParser(
            url='%s:%s' % (self._domain_name, self._port_number))
        self._pos_tagger = StanfordPOSTagger(model_filename=self._pos_model,
                                             path_to_jar=self._pos_jar)
        self._dep_parser = StanfordDependencyParser(
            path_to_jar=self._parser_jar,
            path_to_models_jar=self._parser_model,
            java_options='-Xmx16G')

    def tokenizer(self, input_text):
        return list(self._core_nlp_parser.tokenize(input_text))

    def pos_tag(self, input_tokenized_sentence):
        return self._pos_tagger.tag(input_tokenized_sentence)

    def pos_tag_sentences(self, input_tokenized_sentences):
        return self._pos_tagger.tag_sents(input_tokenized_sentences)

    def dependency_parser(self, input_tokenized_pos_tagged_sentence):
        return self._dep_parser.tagged_parse(
            input_tokenized_pos_tagged_sentence)

    def dependency_parser_sentences(self,
                                    input_tokenized_pos_tagged_sentences):
        return self._dep_parser.tagged_parse_sents(
            input_tokenized_pos_tagged_sentences)
Example 12
class CoreNLPTokenizer(Tokenizer):
    def __init__(
        self,
        url: str = 'http://localhost:9000',
        encoding: str = 'utf-8',
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
    ):
        self._parser = CoreNLPParser(url, encoding, 'pos')

        self._start_tokens = start_tokens or []
        # We reverse the tokens here because we're going to insert them with `insert(0)` later;
        # this makes sure they show up in the right order.
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []

    @overrides
    def tokenize(self, text: str) -> List[Token]:

        tokens = [Token(t) for t in self._parser.tokenize(text)]

        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)

        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)

        return tokens
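A usage sketch of the wrapper, assuming the Token class and overrides decorator imported by the surrounding module (as in allennlp) and a CoreNLP server on localhost:9000; the start/end markers are illustrative:

tokenizer = CoreNLPTokenizer(start_tokens=['<s>'], end_tokens=['</s>'])
print([t.text for t in tokenizer.tokenize('CoreNLP splits punctuation, too.')])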
Example 13
    def test_connection(self):
        st = CoreNLPParser()
        print('testing connection...')
        list(st.tokenize("test"))
        print('Server ready!')
Example 14
def build_vocab(input_json: str,
                threshold: int,
                keep_punctuation: bool,
                host_address: str,
                character_level: bool = False,
                zh: bool = True ):
    """Build vocabulary from csv file with a given threshold to drop all counts < threshold

    Args:
        input_json (string): Preprocessed json file. Structure like this:
            {
              'audios': [
                {
                  'audio_id': 'xxx',
                  'captions': [
                    { 
                      'caption': 'xxx',
                      'cap_id': 'xxx'
                    }
                  ]
                },
                ...
              ]
            }
        threshold (int): Threshold to drop all words with counts < threshold
        keep_punctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary
"""
    data = json.load(open(input_json, "r"))["audios"]
    counter = Counter()
    
    if zh:
        from nltk.parse.corenlp import CoreNLPParser
        from zhon.hanzi import punctuation
        parser = CoreNLPParser(host_address)
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"]
                # Remove all punctuations
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = list(parser.tokenize(caption))
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
                counter.update(tokens)
    else:
        punctuation = ',.():;?!"\''
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"].lower()
                # Remove all punctuations
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), " ", caption)
                caption = re.sub(" +", " ", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = caption.split()
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
                counter.update(tokens)

    json.dump({ "audios": data }, open(input_json, "w"), indent=4)
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word("<pad>")
    vocab.add_word("<start>")
    vocab.add_word("<end>")
    vocab.add_word("<unk>")

    # Add the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
Example 15
        km.fit(self.E)
        self.T = km.cluster_centers_.astype(np.float32)
        self.T /= np.linalg.norm(self.T, axis=-1, keepdims=True)
        return self


if __name__ == '__main__':
    w2v = word2vec('../../data/tokenized.txt')
    w2v.embed('../word_vector/yelp.w2v', 400)
    print("Word2Vec training finished.")

    origin_path = '../../data/converted'
    dest_path = '../../data/absc_encoded'
    names = ['train.txt', 'dev.txt', 'test.txt']

    for name in names:
        print(f"Processing {name}...")
        with open(os.path.join(origin_path, name), 'r') as f:
            lines = f.readlines()
        with open(os.path.join(dest_path, name), 'w') as f:
            for line in tqdm(lines):
                review = line.split('\t')[0]
                score = line.split('\t')[1]
                tokens = tokenizer.tokenize(review)
                indices = [
                    str(w2v.w2i[token])
                    if token in w2v.w2i else str(w2v.w2i['<unk>'])
                    for token in tokens
                ]
                f.write(' '.join(indices) + '\t' + score)
Example 16
    def _execute(args):
        index, line = args
        if data_format == 'jsonl':
            d = json.loads(line)
            label = d.get('gold_label', '-').strip().lower()
            sent1 = d['sentence1'].strip()
            sent2 = d['sentence2'].strip()
        elif data_format == 'tsv':
            l = line.split('\t')
            if corpus_type == 'snli':
                label = l[0].strip().lower()
                sent1 = l[5].strip()
                sent2 = l[6].strip()
            elif corpus_type == 'xnli':
                sent1 = l[0].strip()
                sent2 = l[1].strip()
                label = l[2].strip().lower()
            else:
                raise ValueError(f'Invalid `corpus_type`: {corpus_type}')
        else:
            raise ValueError(f'Invalid `data_format`: {data_format}')
        if label == 'contradictory':
            label = 'contradiction'

        if (not sent1) or (not sent2):
            tqdm.write(f'Line [{index}]: blank text, will be ignored. {line}', file=sys.stderr)
            return

        sent1 = remove_cjk_whitespace(sent1)
        sent2 = remove_cjk_whitespace(sent2)

        segments = []
        if corenlp:
            parser = CoreNLPParser(corenlp)
            for sent in (sent1, sent2):
                tokens = list(parser.tokenize(sent))
                segments.append(' '.join(tokens))
        elif ltp:
            for sent in (sent1, sent2):
                tokens = []
                r = requests.post(ltp, data={'s': sent, 'x': 'n', 't': 'ws'})
                r.raise_for_status()
                ltp_result = r.json()
                for ltp_sent in ltp_result[0]:
                    for ltp_w in ltp_sent:
                        ws = ltp_w['cont'].strip()
                        if ws:
                            tokens.append(ws)
                segments.append(' '.join(tokens))

        result = json.dumps(
            {
                'index': index,
                'gold_label': label,
                'sentence1': segments[0],
                'sentence2': segments[1],
            },
            ensure_ascii=False)

        if output_file:
            with lock:
                print(result, file=f_out, flush=flush)
        else:
            tqdm.write(result)
Example 17
def build_vocab(df: pd.DataFrame,
                threshold: int,
                keeppunctuation: bool,
                host_address: str,
                character_level: bool = False,
                zh: bool = True,
                pretokenized: bool = False):
    """Build vocabulary from a dataframe with a given threshold to drop all counts < threshold

    Args:
        df (pd.DataFrame): Input dataframe. Should have a column named 'caption'
        threshold (int): Threshold to drop all words with counts < threshold
        keeppunctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary
    """
    from nltk.parse.corenlp import CoreNLPParser
    from zhon.hanzi import punctuation
    counter = Counter()

    if pretokenized:
        assert "tokens" in df.columns, "Pretokenized words should be in the `token` column"

    if zh:
        parser = CoreNLPParser(host_address)
        for i in tqdm(range(len(df)), leave=False, ascii=True):
            if pretokenized:
                tokens = df.iloc[i]['tokens']
            else:
                caption = str(df.loc[i]['caption'])
                # Remove all punctuations
                if not keeppunctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = list(parser.tokenize(caption))
            counter.update(tokens)
    else:
        punctuation = ',.()'
        for i in tqdm(range(len(df)), leave=False, ascii=True):
            if pretokenized:
                tokens = df.loc[i]['tokens']
            else:
                caption = str(df.loc[i]['caption'])
                # Remove all punctuations
                if not keeppunctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = caption.split()
            counter.update(tokens)

    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
Example 18
    path_to_model=
    "C:/Users/user/Desktop/NLP/sanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict=
    "C:/Users/user/Desktop/NLP/stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz",
)

text = ("这是斯坦福中文分词器测试")
segmenter.segment(text)

# This one would not run on this machine QQ

#%% NLTK CoreNLPParser
# The java server must be started from cmd first (see nlp start server.txt)
from nltk.parse.corenlp import CoreNLPParser
corenlp_parser = CoreNLPParser('http://localhost:9001', encoding='utf8')
token_list = list(corenlp_parser.tokenize(ptt_sim))

#%% thulac
import thulac

thu1 = thulac.thulac(seg_only=True)
thu1.cut(ptt_sim, text=True)
thu1.cut(news_sim, text=True)

#%% CKIPWS
import os
import subprocess


def ckipws_tokenizes(input):
    os.chdir("C:/Users/user/Desktop/NLP/CKIPWS")
Example 19
def tokenize_caption(input_json: str,
                     keep_punctuation: bool = False,
                     host_address: str = None,
                     character_level: bool = False,
                     zh: bool = True ):
    """Build vocabulary from csv file with a given threshold to drop all counts < threshold

    Args:
        input_json(string): Preprossessed json file. Structure like this: 
            {
              'audios': [
                {
                  'audio_id': 'xxx',
                  'captions': [
                    { 
                      'caption': 'xxx',
                      'cap_id': 'xxx'
                    }
                  ]
                },
                ...
              ]
            }
        threshold (int): Threshold to drop all words with counts < threshold
        keep_punctuation (bool): Includes or excludes punctuation.

    Returns:
        vocab (Vocab): Object with the processed vocabulary
"""
    data = json.load(open(input_json, "r"))["audios"]
    
    if zh:
        from nltk.parse.corenlp import CoreNLPParser
        from zhon.hanzi import punctuation
        parser = CoreNLPParser(host_address)
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"]
                # Remove all punctuations
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), "", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = list(parser.tokenize(caption))
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)
    else:
        punctuation = ',.():;?!"\''
        for audio_idx in tqdm(range(len(data)), leave=False, ascii=True):
            for cap_idx in range(len(data[audio_idx]["captions"])):
                caption = data[audio_idx]["captions"][cap_idx]["caption"].lower()
                # Remove all punctuations
                if not keep_punctuation:
                    caption = re.sub("[{}]".format(punctuation), " ", caption)
                caption = re.sub(" +", " ", caption)
                if character_level:
                    tokens = list(caption)
                else:
                    tokens = caption.split()
                data[audio_idx]["captions"][cap_idx]["tokens"] = " ".join(tokens)

    json.dump({ "audios": data }, open(input_json, "w"), indent=4)
Example 20
#!/usr/bin/python3
# coding: utf-8
##################################################################
## CoreNLP
# server$ cd ~/datasets/Lib/CoreNLP/stanford-corenlp-full-2018-01-31
# server$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse,depparse -status_port 9000 -port 9000 -timeout 15000
# client$ ssh -fN -L 9000:localhost:9000 [email protected] -p 23622  # forward local port 9000 (left) to port 9000 (right) on the lab server
from nltk.parse.corenlp import CoreNLPParser
stanford = CoreNLPParser()
sent = 'proved to be fake, made-up'
token = list(stanford.tokenize(sent)); print(token)  # ['proved', 'to', 'be', 'fake', ',', 'made-up']
sent = 'proved to    be fake, made-up'  # extra whitespace does not change the result
token = list(stanford.tokenize(sent)); print(token)  # ['proved', 'to', 'be', 'fake', ',', 'made-up']

# ../jptstanford_corenlp/l1_tokenizer.py has the same functionality, but it needs root privileges, which is a hassle
Example 21
def segment_one(url, s):
    parser = CoreNLPParser(url, tagtype='pos')
    return list(parser.tokenize(pre_segment(s)))