def gen_feature(atuple):
    """Build the set of tagged features for one record.

    URLs in ``atuple['text']`` are first collapsed to the part captured by
    the pattern's first group (the host-like run right after ``http(s)://``).
    The cleaned text is then handed to the ``ngram`` and ``lexical`` helpers
    and every item they report is stored as a ``(tag, value)`` pair:

    * ``nb<n>`` / ``nbcs<n>`` -- entries of ``ngram_byte`` / ``ngram_byte_cs``
    * ``nw<n>`` / ``nwc<n>``  -- entries of ``ngram_word`` /
      ``ngram_word_clean``, joined with single spaces
    * ``w`` / ``cw``          -- the two sequences returned by
      ``ngram.get_words``
    * ``l``                   -- entries of ``lex`` from
      ``lexical.get_symbol_dist``

    Args:
        atuple: Object indexable by ``'text'`` that yields the raw string.

    Returns:
        A new ``set`` of ``(tag, value)`` tuples.
    """
    url_pattern = r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*'
    cleaned = re.sub(url_pattern, '\\1', atuple['text'], flags=re.MULTILINE)

    features = set()

    # Byte-level n-grams; each table is iterated by n and then indexed by n.
    byte_grams = ngram.get_byte_ngrams(cleaned)
    plain_bytes = byte_grams['ngram_byte']
    features.update(
        ('nb%d' % size, gram)
        for size in plain_bytes
        for gram in plain_bytes[size]
    )
    cs_bytes = byte_grams['ngram_byte_cs']
    features.update(
        ('nbcs%d' % size, gram)
        for size in cs_bytes
        for gram in cs_bytes[size]
    )

    # Word-level n-grams; each gram is an iterable of strings joined by ' '.
    word_grams = ngram.get_word_ngrams(cleaned)
    raw_words = word_grams['ngram_word']
    features.update(
        ('nw%d' % size, ' '.join(gram))
        for size in raw_words
        for gram in raw_words[size]
    )
    cleaned_words = word_grams['ngram_word_clean']
    features.update(
        ('nwc%d' % size, ' '.join(gram))
        for size in cleaned_words
        for gram in cleaned_words[size]
    )

    # Single tokens, in both variants returned by the helper.
    tokens, clean_tokens = ngram.get_words(cleaned)
    features.update(('w', token) for token in tokens)
    features.update(('cw', token) for token in clean_tokens)

    # Symbols reported by the lexical helper.
    symbol_info = lexical.get_symbol_dist(cleaned)
    features.update(('l', symbol) for symbol in symbol_info['lex'])

    return features
def gen_feature(atuple):
    """Return the feature set extracted from ``atuple['text']``.

    Every URL matched by the pattern is replaced with its first capture
    group (the host-like segment following ``http://`` or ``https://``).
    The resulting text is passed to ``ngram.get_byte_ngrams``,
    ``ngram.get_word_ngrams``, ``ngram.get_words`` and
    ``lexical.get_symbol_dist``, in that order. Each reported item becomes a
    ``(tag, value)`` tuple, where the tag is one of ``nb<n>``, ``nbcs<n>``,
    ``nw<n>``, ``nwc<n>``, ``w``, ``cw`` or ``l``.

    Args:
        atuple: Object supporting ``atuple['text']``.

    Returns:
        A freshly built ``set`` of ``(tag, value)`` tuples.
    """
    stripped = re.sub(
        r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
        '\\1',
        atuple['text'],
        flags=re.MULTILINE,
    )
    collected = set()

    # Byte n-grams: values are stored unchanged.
    byte_result = ngram.get_byte_ngrams(stripped)
    for key, tag_format in (('ngram_byte', 'nb%d'), ('ngram_byte_cs', 'nbcs%d')):
        table = byte_result[key]
        for order in table:
            for item in table[order]:
                collected.add((tag_format % order, item))

    # Word n-grams: each gram is joined into one space-separated string.
    word_result = ngram.get_word_ngrams(stripped)
    for key, tag_format in (('ngram_word', 'nw%d'), ('ngram_word_clean', 'nwc%d')):
        table = word_result[key]
        for order in table:
            for item in table[order]:
                collected.add((tag_format % order, ' '.join(item)))

    # Individual words: first sequence tagged 'w', second tagged 'cw'.
    plain, cleaned = ngram.get_words(stripped)
    for token in plain:
        collected.add(('w', token))
    for token in cleaned:
        collected.add(('cw', token))

    # Entries of the 'lex' member of the symbol distribution.
    for symbol in lexical.get_symbol_dist(stripped)['lex']:
        collected.add(('l', symbol))

    return collected
def gen_feature(atuple):
    """Build ``npos<n>`` / ``pos`` features from ``atuple['text']``.

    URLs are first reduced to the pattern's first capture group (the
    host-like part after ``http(s)://``). The function then collects:

    * ``('npos<n>', joined)`` for every gram under ``ngram_word`` returned
      by ``ngram.get_word_ngrams``, with the gram joined by single spaces;
    * ``('pos', word)`` for every item of the first sequence returned by
      ``ngram.get_words``.

    NOTE(review): the tag names suggest the text holds part-of-speech
    tokens, but nothing in this function establishes that -- confirm against
    the caller.

    Args:
        atuple: Object supporting ``atuple['text']``.

    Returns:
        A new ``set`` of ``(tag, value)`` tuples.
    """
    url_pattern = r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*'
    cleaned = re.sub(url_pattern, '\\1', atuple['text'], flags=re.MULTILINE)

    features = set()

    grams_by_size = ngram.get_word_ngrams(cleaned)['ngram_word']
    features.update(
        ('npos%d' % size, ' '.join(gram))
        for size in grams_by_size
        for gram in grams_by_size[size]
    )

    # The helper returns a pair; only its first element is used here.
    tokens, _ = ngram.get_words(cleaned)
    features.update(('pos', token) for token in tokens)

    return features
def gen_feature(atuple):
    """Return word n-gram and word features tagged ``npos<n>`` and ``pos``.

    The text under ``atuple['text']`` has each matched URL replaced by the
    first capture group of the pattern (the host-like run after the scheme).
    Grams under ``ngram_word`` from ``ngram.get_word_ngrams`` are joined
    with spaces and tagged ``npos<n>``. Items of the first sequence from
    ``ngram.get_words`` are tagged ``pos``.

    NOTE(review): whether the input really carries part-of-speech tokens,
    as the tag names hint, cannot be told from this function -- verify with
    the caller.

    Args:
        atuple: Object supporting ``atuple['text']``.

    Returns:
        A freshly built ``set`` of ``(tag, value)`` tuples.
    """
    stripped = re.sub(
        r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
        '\\1',
        atuple['text'],
        flags=re.MULTILINE,
    )

    word_table = ngram.get_word_ngrams(stripped)['ngram_word']
    collected = {
        ('npos%d' % order, ' '.join(item))
        for order in word_table
        for item in word_table[order]
    }

    # get_words yields two sequences; the second one is not needed here.
    plain, _unused = ngram.get_words(stripped)
    collected |= {('pos', token) for token in plain}

    return collected