Example 1
    def combine_stopwords(dataframe_in, stopword_dict):
        """
        Please use the stopwords() function and input that into the stopword_dict parameter.
        returns filtered tokens.
        :param dataframe_in:
        :param stopword_dict:
        :return:
        """
        nlp = spacy.load("en_core_web_lg")

        # Tokenizer
        tokenizer = Tokenizer(nlp.vocab)

        tokens = []

        for doc in tokenizer.pipe(dataframe_in, batch_size=500):

            doc_tokens = []

            for token in doc:
                if token.text.lower() not in stopword_dict:
                    doc_tokens.append(token.text.lower())

            tokens.append(doc_tokens)

        return tokens
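
A minimal usage sketch for combine_stopwords, assuming it is reachable as a plain function, that a stopwords() helper returning a set exists elsewhere, and that en_core_web_lg is installed (all three are assumptions, not shown in the snippet above):

    import pandas as pd

    # Hypothetical inputs: a Series of documents and a stop-word set such as
    # the one a stopwords() helper would return.
    descriptions = pd.Series(["The quick brown fox", "jumps over the lazy dog"])
    stop_words = {"the", "over"}

    filtered = combine_stopwords(descriptions, stop_words)
    # filtered -> [['quick', 'brown', 'fox'], ['jumps', 'lazy', 'dog']]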
Example 2
    def predict(user_input):  # define a prediction function
        """Clean and tokenize the corpus, then return the model's prediction
        for user_input. Assumes module-level nlp, df, and model objects."""
        tokenizer = Tokenizer(nlp.vocab)

        # Stop words: spaCy defaults plus a few corpus-specific extras.
        STOP_WORDS = nlp.Defaults.stop_words.union(
            ['', ' ', '-', 'reddit', 'post'])

        tokens = []
        for doc in tokenizer.pipe(df['combo'], batch_size=500):
            doc_tokens = []
            for token in doc:
                if ((token.text.lower() not in STOP_WORDS)
                        and not token.is_stop
                        and not token.is_punct
                        and (token.pos_ != 'PRON')):
                    doc_tokens.append(token.lemma_.lower())
            tokens.append(' '.join(doc_tokens))

        df['tokens'] = tokens

        # Fit the vectorizer on the cleaned corpus, then vectorize the input.
        tfidf = TfidfVectorizer(min_df=0.025, max_df=.98, ngram_range=(1, 2))
        tfidf.fit(df['tokens'])
        vec_text = tfidf.transform(user_input)
        output = model.predict(vec_text.todense())

        # give output to sender.
        return jsonify({"response": output.tolist()})
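
For context, a hedged sketch of how a predict function like this is typically wired into a Flask app. The nlp, df, and model globals, the file paths, and the /predict route are assumptions for illustration, not part of the original snippet:

    import joblib
    import pandas as pd
    import spacy
    from flask import Flask, request, jsonify
    from spacy.tokenizer import Tokenizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    app = Flask(__name__)
    nlp = spacy.load("en_core_web_lg")       # shared language object
    df = pd.read_csv("posts.csv")            # hypothetical corpus with a 'combo' column
    model = joblib.load("model.joblib")      # hypothetical pre-trained classifier

    @app.route("/predict", methods=["POST"])
    def predict_route():
        user_input = [request.get_json()["text"]]
        return predict(user_input)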
Example 3
    def _tokenizer(df):
        nlp = English()
        tokenizer = Tokenizer(nlp.vocab)

        for doc in tokenizer.pipe(df.values.tolist(), batch_size=50):
            for token in doc:
                yield token
Example 4
def tokenize_data(input_data):
    nlp = spacy.load("en")
    tokenizer = Tokenizer(nlp.vocab)
    string_data = [str(data) for data in input_data]
    tokenized_data = [[str(w) for w in doc]
                      for doc in tokenizer.pipe(string_data, batch_size=50)]
    return tokenized_data
Example 5
def train_tokenizer_mldoc(train_size=1000, datapath='../data/', savepath=None):
    # Training a tokenizer for MLDoc dataset: create a vocabulary for each language in MLDoc
    languages = [
        "english", "german", "spanish", "french", "italian", "russian",
        "chinese", "japanese"
    ]
    mldoc_folder = os.path.join(datapath, 'mldoc')
    savepath = os.path.join(mldoc_folder,
                            'vocab') if savepath is None else savepath
    for lang in languages:
        print("\n\n\t\t *** Training tokenizer for {} ***".format(lang))
        train_f = os.path.join(mldoc_folder,
                               lang + '.train.{}'.format(train_size))
        print('loading data')
        train = pd.read_csv(train_f,
                            delimiter='\t',
                            header=None,
                            names=["label", "text"])
        print('data: {}'.format(train.shape))
        tokenizer = Tokenizer(language=lang,
                              train_list=train['text'].tolist(),
                              ngram_range=(1, 1),
                              min_freq=1,
                              max_freq_perc=1.0,
                              vocab_savefolder=savepath)
        print("creating new tokenizer")
        tokenizer2 = Tokenizer(language=lang, vocab_loadfolder=savepath)
        print('loaded vocab: {}'.format(len(tokenizer2.word2ind)))
Example 6
def train_tokenizer_twitter_sent(datapath='../data/', savepath=None):
    languages = [
        'arabic', 'bulgarian', 'german', 'english', 'spanish', 'persian',
        'croatian', 'hungarian', 'polish', 'portuguese', 'russian', 'slovak',
        'slovenian', 'swedish', 'uyghur', 'chinese'
    ]
    savepath = os.path.join(
        datapath, 'twitter_sent/vocab') if savepath is None else savepath
    for lang in languages:
        print("\n\n\t\t *** Training tokenizer for {} ***".format(lang))
        train = load_df_twitter_sent(method='train',
                                     language=lang,
                                     print_fn=print)
        print('data: {}'.format(train.shape))
        # NOTE: We use tokenizer_method = 'clean' because data are already tokenized
        tokenizer = Tokenizer(language=lang,
                              train_list=train['text'].tolist(),
                              tokenizer_method='clean',
                              remove_stopwords=False,
                              ngram_range=(1, 1),
                              min_freq=5,
                              max_freq_perc=1.0,
                              vocab_savefolder=savepath)
        print("creating new tokenizer")
        tokenizer2 = Tokenizer(language=lang,
                               vocab_loadfolder=savepath,
                               tokenizer_method='clean',
                               remove_stopwords=False)
        print('loaded vocab: {}'.format(len(tokenizer2.word2ind)))
    return
Example 7
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab()
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].norm_ == tokens[0]["norm"]
    assert doc[1].text == tokens[1]["orth"]
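
The text and tokens arguments are supplied by pytest parametrization. One plausible parametrization, consistent with the assertions above (the exact data is an assumption):

    import pytest

    @pytest.mark.parametrize(
        "text,tokens",
        [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])],
    )
    def test_tokenizer_add_special_case_tag(text, tokens):
        ...  # body as in the example above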
Example 8
 def __init__(self,
              rootDir='.cache',
              vectorPath='vectors',
              tokenizerPath='tokenizer'):
     self.vectorPath = Path.cwd() / rootDir / vectorPath
     self.tokenizerPath = Path.cwd() / rootDir / tokenizerPath
     self.tokenizer = Tokenizer(Vocab())
     self.vectors = Vectors(shape=(41299, 300))
Example 9
def test_tokenizer_add_special_case_tag(text, tokens):
    vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    tokenizer.add_special_case(text, tokens)
    doc = tokenizer(text)
    assert doc[0].text == tokens[0]["orth"]
    assert doc[0].tag_ == tokens[0]["tag"]
    assert doc[0].pos_ == "NOUN"
    assert doc[1].text == tokens[1]["orth"]
Example 10
def test_tokenizer_flush_cache(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    tokenizer = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
    )
    assert [t.text for t in tokenizer("a.")] == ["a", "."]
    tokenizer.suffix_search = None
    assert [t.text for t in tokenizer("a.")] == ["a."]
Example 11
 def transform(self, data):
     tokenizer = Tokenizer(nlp.vocab)
     return np.array([
         np.mean([
             self.model[w.text.lower()] * self.word2weight[w.text.lower()]
             for w in words if w.text.lower() in self.model
         ] or [np.zeros(self.dim)],
                 axis=0) for words in tokenizer.pipe(data)
     ])
Example 13
def get_lemmas(text):

    # nlp = spacy.load("en_core_web_sm-2.2.5", path="airbnb_api/")
    nlp = spacy.load("en_core_web_sm-2.2.5", path="./")
    # nlp = spacy.load("en_core_web_sm")
    # nlp = en_core_web_sm.load()

    tokenizer = Tokenizer(nlp.vocab)

    STOP_WORDS = nlp.Defaults.stop_words.union([
        '  ', 'und', '-', 'die', 'der', 'berlin', 'ein', 'das', 'mit', 'ist',
        'im', 'zu', 'eine', 'es', 'für',
        'berlin.', 'zum', 'sind', 'für', 'Berlin.', '-pron-', 's', 'u', '',
        "'", ' ', '-PRON-'
    ])

    lemmas = []

    doc = nlp(text)

    for token in doc:
        lemmas.append(token.lemma_)

    lemma_summary = []

    working_set = ""
    for lemma in lemmas:
        working_set += lemma + ' '
    lemma_summary.append(working_set)

    description = [lemma_summary[0]]

    tokens = []

    for doc in tokenizer.pipe(description, batch_size=500):

        doc_tokens = []

        for token in doc:
            if ((token.is_stop == False) and
                (token.is_punct == False)) and (token.pos_ != 'PRON'):
                if token.text.lower() not in STOP_WORDS:
                    doc_tokens.append(token.text.lower())

        tokens.append(doc_tokens)

    token_summary = []

    for set_of_tokens in tokens:
        working_set = ""
        for variable in set_of_tokens:
            working_set += variable + ' '
        token_summary.append(working_set)

    return token_summary[0]
Example 14
def test_tokenizer_flush_specials(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    rules = {"a a": [{"ORTH": "a a"}]}
    tokenizer1 = Tokenizer(
        en_vocab,
        suffix_search=suffix_re.search,
        rules=rules,
    )
    assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
    tokenizer1.rules = {}
    assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
Example 15
    def __init__(self, whitespace_tokenizer_for_coref=True, whitespace_tokenizer_for_tokenizer=False):
        # for tokenization
        self.tokenize_nlp = spacy.load('en')
        if whitespace_tokenizer_for_tokenizer:
            self.tokenize_nlp.tokenizer = Tokenizer(self.tokenize_nlp.vocab)

        # for coreference resolution
        self.whitespace_tokenizer_for_coref = whitespace_tokenizer_for_coref
        self.coref_nlp = spacy.load('en')
        if whitespace_tokenizer_for_coref:
            self.coref_nlp.tokenizer = Tokenizer(self.coref_nlp.vocab)
Example 16
def test_tokenizer_initial_special_case_explain(en_vocab):
    tokenizer = Tokenizer(
        en_vocab,
        token_match=re.compile("^id$").match,
        rules={
            "id": [{"ORTH": "i"}, {"ORTH": "d"}],
        },
    )
    tokens = [t.text for t in tokenizer("id")]
    explain_tokens = [t[1] for t in tokenizer.explain("id")]
    assert tokens == explain_tokens
Example 17
def test_tokenizer_explain_special_matcher(en_vocab):
    suffix_re = re.compile(r"[\.]$")
    infix_re = re.compile(r"[/]")
    rules = {"a.": [{"ORTH": "a."}]}
    tokenizer = Tokenizer(
        en_vocab,
        rules=rules,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
    )
    tokens = [t.text for t in tokenizer("a/a.")]
    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
    assert tokens == explain_tokens
Example 18
def test_spacy_tokenizer_pipe(nlp):
    tokenizer = Tokenizer(nlp.vocab)

    token_sets = []
    for doc in tokenizer.pipe(DOCUMENTS, batch_size=2):
        doc_tokens = [token.text for token in doc]
        token_sets.append(doc_tokens)

    assert token_sets == [['all', 'the', 'kings', 'men'],
                          ['ate', 'all', 'the', 'kings', 'hens'],
                          [
                              'until', 'they', 'all', 'got', 'tired', 'and',
                              'went', 'to', 'sleep', 'zzz'
                          ]]
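
test_spacy_tokenizer_pipe relies on an nlp fixture and a DOCUMENTS constant defined elsewhere in the test module; a plausible reconstruction, inferred from the expected token sets:

    import pytest
    import spacy

    DOCUMENTS = [
        "all the kings men",
        "ate all the kings hens",
        "until they all got tired and went to sleep zzz",
    ]

    @pytest.fixture
    def nlp():
        # A blank English pipeline is enough to provide a vocab for Tokenizer.
        return spacy.blank("en")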
Example 19
def test_tokenizer_infix_prefix(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
Example 20
def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    prefixes = ["a(?=.)"]
    suffixes = [r"(?<=\w)\.", r"(?<=a)\d+\."]
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("a10.")]
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens
Example 21
    def tokenize(df_in):
        """
        Tokenize by inputting a dataframe. Outputs a tokenized list.
        :param df_in:
        :return:
        """
        nlp = spacy.load("en_core_web_lg")

        # Tokenizer
        tokenizer = Tokenizer(nlp.vocab)
        tokens = []
        for doc in tokenizer.pipe(df_in, batch_size=500):
            doc_tokens = [token.text for token in doc]
            tokens.append(doc_tokens)
        return tokens
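
A minimal, hypothetical call of tokenize, assuming it is reachable as a plain function and en_core_web_lg is installed:

    import pandas as pd

    reviews = pd.Series(["great product", "arrived late but works fine"])
    token_lists = tokenize(reviews)
    # token_lists -> [['great', 'product'], ['arrived', 'late', 'but', 'works', 'fine']]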
Example 22
    def __init__(self):
        """ coppied from notebook at app/ml/Build_week_IsaacGrove.ipynb
        """
        self.PICKLE_PATH = path.join(path.dirname(__file__), '..', 'pickles',
                                     '')

        # For now, data is loaded from a static link; pulling live data is
        # planned for future iterations.
        leafly = pd.read_csv(
            'https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv'
        )

        # Set up spacy tokenizer
        nlp = English()
        tokenizer = Tokenizer(nlp.vocab)

        # work around for pickle
        self.nlp = nlp

        # clean some missing info
        leafly.replace('None', np.NaN, inplace=True)
        leafly = leafly.dropna()

        # Make tokens out of descriptions
        tokens = []
        for desc in tokenizer.pipe(leafly['Description'], batch_size=500):
            desc_tokens = [token.text for token in desc]
            tokens.append(desc_tokens)
        leafly['tokens'] = tokens
        leafly['tokens'].head()

        # Instantiate vectorizer object
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_df=.7,
                                min_df=.001,
                                tokenizer=self.tokenize)

        # Create a vocabulary and get word counts per listing
        dtm = tfidf.fit_transform(leafly['Description'])

        # Get feature names to use as dataframe column headers
        dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
        # Fit on dtm
        nn = NearestNeighbors(n_neighbors=20, algorithm='kd_tree')
        nn.fit(dtm)
        self.model = nn
        self.transform = tfidf
        return
Example 23
    def __init__(self, language):
        self.nlp = load_spacy(language)

        custom_infixes = ['/']
        all_prefixes_re = spacy.util.compile_prefix_regex(tuple(list(self.nlp.Defaults.prefixes) + custom_infixes))

        infix_re = spacy.util.compile_infix_regex(tuple(list(self.nlp.Defaults.infixes) + custom_infixes))

        suffix_re = spacy.util.compile_suffix_regex(tuple(list(self.nlp.Defaults.suffixes) + custom_infixes))   

        self.nlp.tokenizer = Tokenizer(self.nlp.vocab, self.nlp.Defaults.tokenizer_exceptions,
                        prefix_search = all_prefixes_re.search, 
                        infix_finditer = infix_re.finditer, 
                        suffix_search = suffix_re.search,
                        token_match=None)
        
        self.matcher = Matcher(self.nlp.vocab)

        self.matcher.add('morphology', None, [
             {"TEXT": {"REGEX" : r'^\d\d\d\d$'}},
             {"TEXT": {"REGEX" : r'\s'}, "OP" : "*"},
             {"TEXT": '/'},
             {"TEXT": {"REGEX" : r'\s'}, "OP" : "*"},
             {"TEXT": {"REGEX" : r'\d'}},
        ])
Example 24
def clean_mag_data(dataframe, save_path):

    samples = []

    # prepare tokenization functions
    nlp = spacy.load("en_core_web_lg")
    tokenizer = Tokenizer(nlp.vocab)

    # Keep only samples whose citation context has at least MIN_CONTEXT_LENGTH tokens
    for index, row in dataframe.iterrows():
        context = row['context']
        text = re.sub("[" + re.escape(string.punctuation) + "]", " ", context)
        text = [
            token.lemma_ for token in tokenizer(text) if not token.like_num
        ]
        text = [token for token in text if token.strip()]
        if (len(text) < MIN_CONTEXT_LENGTH):
            continue
        # generate sample in correct format
        sample = {
            # "paper_id": row['paperid'],
            "context": context,
            "authors_citing": row['citingauthors'],
            "title_cited": row['citedtitle'],
            "authors_cited": row['citedauthors']
        }
        samples.append(pd.DataFrame(sample, index=[0]))

    logger.info("mag samples ready to load to file...")

    dataset = pd.concat(samples, axis=0)
    dataset.to_csv(save_path, compression=None, index=False, index_label=False)
Example 25
 def __init__(self, rollout_num, vocab):
     #self.new_net = copy.deepcopy(net)
     self.vocab = vocab
     self.tokenizer = Tokenizer(
         Vocab(strings=list(vocab.labelToIdx.keys())))
     self.rollout_num = rollout_num
     self.parser = StanfordParser(annots='tokenize')
Example 26
 def transform(self, data):
     tokenizer = Tokenizer(nlp.vocab)
     return np.array(
         [
             np.mean(
                 [
                     self.model[w.text.lower()]
                     for w in words
                     if w.text.lower() in self.model
                 ]
                 or [np.zeros(self.dim)],
                 axis=0,
             )
             for words in tokenizer.pipe(data)
         ]
     )
Example 27
def search_func(user_input, num_results=10):
    """
    Flexible function that searches for cannabis strains.

    ### Request Body
    - user_input str
    - num_results int: default 10
    ### Response
    - `strain_recommendations`: list of recommended strains
    """

    user_input = [user_input]
    nlp = English()
    tokenizer = Tokenizer(nlp.vocab)
    tf = TfidfVectorizer(stop_words='english')
    dtm = tf.fit_transform(df['search'])
    dtm = pd.DataFrame(dtm.todense(), columns=tf.get_feature_names())

    nr = num_results
    nn = NearestNeighbors(n_neighbors=nr, algorithm='ball_tree')
    nn.fit(dtm)
    dtf = tf.transform(user_input)
    _, output = nn.kneighbors(dtf.todense())

    recommendations = []
    for n in output:
        for row in n:
            recommendations.append(row)

    result = []
    for i in recommendations:
        data = (df.loc[i, :])
        result.append(data)
    return {'strain_recommendations': result}
Example 28
def load_data(data_path: str,
              tokenize: bool = False,
              tokenizer_type: str = "just_spaces") -> List[str]:
    if tokenizer_type == "just_spaces":
        tokenizer = SpacyWordSplitter()
    elif tokenizer_type == "spacy":
        nlp = spacy.load('en')
        tokenizer = Tokenizer(nlp.vocab)
    tokenized_examples = []
    with tqdm(open(data_path, "r"), desc=f"loading {data_path}") as f:
        for line in f:
            if data_path.endswith(".jsonl") or data_path.endswith(".json"):
                example = json.loads(line)
            else:
                example = {"text": line}
            if tokenize:
                if tokenizer_type == 'just_spaces':
                    tokens = list(
                        map(str, tokenizer.split_words(example['text'])))
                elif tokenizer_type == 'spacy':
                    tokens = list(map(str, tokenizer(example['text'])))
                text = ' '.join(tokens)
            else:
                text = example['text']
            tokenized_examples.append(text)
    return tokenized_examples
Example 29
 def _create_tokenizer(nlp):
     infix_re = spacy.util.compile_infix_regex(
         TOKENIZER_INFIXES + [  # u'\w*[,-.–_—:;\(\)\[\]\{\}/]{1,3}\S\w*',
             # r'\w*[,\-.\-_:;\(\)\[\]\{\}\/]{1,3}\S\w*',
             # r'((?P<start_with_non_whitespace_and_one_or_more_punctation>\b\S+|[,.-_:;\(\)\[\]\{\}/\+])(
             # ?P<has_1_or_more_punctation>[,.-_:;\(\)\[\]\{\}/\+])+(
             # ?P<ends_with_non_whitespace_or_non_terminating_punctation>\S+\b[,.-_:;\(\)\[\]\{\}/\+]|[,.-_:;\(\)\[
             # \]\{\}/\+|\-]|\S+\b))',
             # r'\w*\S-\S*\w',
             # u'\w*\S–\S*\w',
             # u'\w*\S—\S*\w',
             # u'\w*[,-.–_—:;\(\)\[\]\{\}/]{1,3}\S\w*'
             r'(?P<start_with_non_whitespace_and_one_or_more_punctation>\b\S*|[,.-_-:–;—\(\[\{/\+]?)('
             r'?P<has_1_or_more_punctation>[,.-_-:–;—\(\)\[\]\{\}/\+])+('
             r'?P<ends_with_non_whitespace_or_non_terminating_punctation>\S+\b[,.-_-:–;—\)\]\}/\+]|[,'
             r'.-_-:–;—\)\]\}/\+}]|\S+\b)'
         ])
     # TODO: prefix and suffix raise TypeError: '_regex.Pattern' object is not callable
     # prefix_boundaries_to_keep =  ur'\) \] \} \> , . - _ - : – ; — \+ -'.split()
     # suffix_boundaries_to_keep = ur'\( \[ \{ \< , . - _ - : – ; — \+ -'.split()
     # prefixe_re = spacy.util.compile_prefix_regex([i for i in TOKENIZER_PREFIXES if i not in
     # prefix_boundaries_to_keep])
     # suffixe_re = spacy.util.compile_suffix_regex([i for i in TOKENIZER_SUFFIXES if i not in
     # suffix_boundaries_to_keep])
     #
     # return Tokenizer(nlp.vocab, {}, prefixe_re.search, suffixe_re.search,
     #                  infix_re.finditer)
     return Tokenizer(nlp.vocab, {}, nlp.tokenizer.prefix_search,
                      nlp.tokenizer.suffix_search, infix_re.finditer)
Example 30
    def get_spacy_tokens(self, spacy_sentence, target_word, spacy_model):
        """
        A function to locate the target phrase spacy tokens in a spacy doc of
        a whole sentence.

        Args:
            spacy_sentence: spaCy doc for the context sentence
            target_word: the target word/phrase as a plain string
            spacy_model: loaded spaCy model providing the vocab for tokenization

        Returns:
            spacy_token_list: a list of the spacy tokens for the target phrase,
                              using the information from the context sentence.

        """
        # Create the tokeniser
        tokenizer = Tokenizer(spacy_model.vocab)

        spacy_token_list = []

        for target in tokenizer(target_word):
            for wd in spacy_sentence:
                if target.text == wd.text:
                    spacy_token_list.append(wd)
                    break

        return spacy_token_list
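
A hedged usage sketch for get_spacy_tokens; the en_core_web_sm model and the obj instance of the (unnamed) enclosing class are assumptions for illustration:

    import spacy

    nlp = spacy.load("en_core_web_sm")   # any loaded model with a vocab
    sentence_doc = nlp("The red fox jumped over the fence")

    # obj stands in for an instance of the class that defines get_spacy_tokens.
    matched = obj.get_spacy_tokens(sentence_doc, "red fox", nlp)
    # matched -> [red, fox]  (Token objects taken from the context sentence)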
Example 31
    def test_create(self):
        vocab = Vocab()
        dummy_re = re.compile(r'sklfb;s')
        tokenizer = Tokenizer(vocab, {}, dummy_re, dummy_re, dummy_re)
        doc = tokenizer(u'I am a document.')

        self.assertEqual(len(doc), 4)
Example 32
    def custom_tokenizer(self):
        """ Function that defines a tokenizer in order to be used
        
        Parameters
        -----------
        nlp:  spacy loaded object
        return: prepared tokenizer
        """

        infixes = (
            LIST_ELLIPSES + LIST_ICONS + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ])

        infix_re = compile_infix_regex(infixes)

        return Tokenizer(self.nlp.vocab,
                         prefix_search=self.nlp.tokenizer.prefix_search,
                         suffix_search=self.nlp.tokenizer.suffix_search,
                         infix_finditer=infix_re.finditer,
                         token_match=self.nlp.tokenizer.token_match,
                         rules=self.nlp.Defaults.tokenizer_exceptions)
Example 33
def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")
    
    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)
    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
Example 34
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(vocab,
                    path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
Example 35
def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
    """Test that custom tokenizer with not all functions defined can be
    serialized and deserialized correctly (see #2494)."""
    tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
    tokenizer_bytes = tokenizer.to_bytes()
    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
Example 36
 def test_load(self):
     data_dir = English.default_data_dir()
     vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
     tokenizer = Tokenizer.from_dir(vocab, path.join(data_dir, 'tokenizer'))