def Texts2SVM_Feature(texts, **kwargs):  # , tfidf, stopwords_list, SentiWords):
    ''' transfer list of text to list of features for SVM, return list of features '''
    tfidf = kwargs['tfidf']
    stopwords_list = kwargs['stopwords']
    SentiWords = kwargs['SentiWords']
    # list of list of words: (#texts, #words)
    # start = time.time()
    texts = list(map(lambda text: simple_tokenize(text), texts))
    # print 'tokenize time = ', time.time() - start
    # filter useless stopwords and stemming
    # list of [#noun, #adj, #adv, #verb, #url, #hashtag, #mentions, #number, #cap, #strong_neg, #strong_pos, #weak_neg, #weak_pos]
    other_values = np.array(
        list(
            map(
                lambda text: svm_text_feature(text, stopwords_list, SentiWords),
                texts)))
    # get tfidf value
    # texts = list(map(lambda text: ' '.join(text), texts))
    # tfidf_values = tfidf.fit_transform(texts)
    # print tfidf_values.shape
    # all_features = hstack([tfidf_values, other_values])
    # print all_features.shape
    # return all_features
    return other_values
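# A minimal usage sketch for Texts2SVM_Feature. Assumptions: simple_tokenize and
# svm_text_feature come from this project; the file names and load_sentiwords loader
# below are hypothetical stand-ins for however the lexicon is actually read.
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = ["example tweet one", "another example tweet"]
tfidf = TfidfVectorizer()  # passed through but unused while the tfidf branch is commented out
stopwords_list = set(open('stopwords.txt').read().split())
SentiWords = load_sentiwords('SentiWords.txt')  # hypothetical loader for the sentiment lexicon
features = Texts2SVM_Feature(tweets, tfidf=tfidf, stopwords=stopwords_list, SentiWords=SentiWords)
# features: np.array of shape (#texts, 13), one row per tweet, matching the feature list above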
def Texts2Matrix(texts, model, max_len):
    ''' preprocess tweets by tokenize, embedding and padding, return list of word2vector matrix '''
    # tokenization and replace URL, NUMBERs and MENTION with special tokens
    # start = time.time()
    texts = list(map(lambda t: simple_tokenize(t), texts))  # list: (#tweets, list_of_words)
    # print 'tokenize time = ', time.time() - start
    # embedding
    # start = time.time()
    embeddings = list(
        map(lambda tweet: np.array(list(map(lambda w: w2v(w, model), tweet))),
            texts))  # list: (#tweets, np.array(#words, #dim))
    # print 'embedding time = ', time.time() - start
    # padding
    # start = time.time()
    if max_len == 0:
        max_len = max(list(map(lambda t: len(t), texts)))
    paddings = np.array(list(map(lambda x: padding_2D(x, max_len), embeddings)))
    # print 'padding2D time = ', time.time() - start
    return texts, paddings
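# Hedged sketches of the two helpers Texts2Matrix relies on but that are defined elsewhere
# in the project: w2v looks a word up in a gensim-style embedding model (zero vector for
# out-of-vocabulary words), and padding_2D pads or truncates a (#words, #dim) matrix to
# exactly max_len rows. Both are assumptions about the original implementation.
import numpy as np

def w2v(word, model, dim=300):
    # return the embedding vector for word, or a zero vector if it is out of vocabulary
    try:
        return model[word]
    except KeyError:
        return np.zeros(dim)

def padding_2D(matrix, max_len):
    # truncate to max_len rows, then pad with zero rows up to max_len
    matrix = matrix[:max_len]
    if len(matrix) < max_len:
        pad = np.zeros((max_len - len(matrix), matrix.shape[1]))
        matrix = np.vstack([matrix, pad])
    return matrix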
def Texts2Index(texts, vocab, max_len):
    ''' transfer list of text to list of indexes by looking up the vocab, return list of indexes '''
    # list of list of words: (#texts, #words)
    # start = time.time()
    texts = list(map(lambda text: simple_tokenize(text), texts))
    # print 'tokenize time = ', time.time() - start
    # list of list of indexes: (#texts, #idxes)
    # start = time.time()
    idxes = list(
        map(
            lambda words: list(map(lambda word: word2Index(word, vocab), words)),
            texts))
    # print 'word2index time = ', time.time() - start
    # padding
    # start = time.time()
    if max_len == 0:
        max_len = max(list(map(lambda t: len(t), texts)))
    idxes = list(map(lambda idx: padding_1D(idx, max_len), idxes))
    # print 'padding1D time = ', time.time() - start
    return np.array(idxes)
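# Hedged sketches of the helpers assumed by Texts2Index: word2Index maps a token to its
# vocab index with an unknown-word fallback, and padding_1D pads or truncates the index
# list to max_len. The index-0-for-unknown/padding convention is an assumption, not the
# original code.
def word2Index(word, vocab):
    # index 0 is assumed to be reserved for unknown and padding tokens
    return vocab.get(word, 0)

def padding_1D(idx, max_len):
    # truncate to max_len, then right-pad with 0s
    idx = idx[:max_len]
    return idx + [0] * (max_len - len(idx))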
def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
def build_vocab(filename, min_freq=5):
    ''' build vocab from texts in filename, with minimum frequency (5 by default) '''
    if not os.path.exists(filename):
        print 'file %s does not exist. Please correct the name and try again.' % filename
        return None
    parser = lambda date: pd.datetime.strptime(date[:20] + date[24:], '%c')  # date parser (unused here)
    columns = ['text']
    df = pd.read_csv(filename, names=columns, usecols=[5])  # tweet id as index
    df = df['text'].tolist()
    # texts to words, words: list of list of words
    words = list(map(lambda tweet: simple_tokenize(tweet), df))
    # counting words to get vocab with (word: freq)
    freq_vocab = word2FreqVocab(words)
    # transfer freq_vocab to index vocab with (word: index)
    vocab = freq2IndexVocab(freq_vocab, min_freq)
    with open('vocab.pkl', 'w') as f:
        cPickle.dump(vocab, f)
    return vocab
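# Hedged sketches of the vocab helpers used by build_vocab: word2FreqVocab counts token
# frequencies over the tokenized tweets, and freq2IndexVocab keeps words at or above
# min_freq and assigns them consecutive indices (0 assumed reserved for rare/unknown
# words). These are assumptions about the original helpers, not their actual definitions.
from collections import Counter

def word2FreqVocab(words):
    # words: list of list of tokens -> {word: frequency}
    freq = Counter()
    for tweet in words:
        freq.update(tweet)
    return dict(freq)

def freq2IndexVocab(freq_vocab, min_freq):
    # {word: frequency} -> {word: index}, dropping words rarer than min_freq
    vocab = {}
    for word, freq in freq_vocab.items():
        if freq >= min_freq:
            vocab[word] = len(vocab) + 1  # index 0 reserved for unknown words
    return vocab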
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
import twokenize, util, bigrams
util.fix_stdio()
from sane_re import *

AposFix = _R(r"( |^)(' [stm])( |$)")

for line in sys.stdin:
    parts = util.unicodify(line[:-1]).split("\t")
    text = parts[-1]
    toks = twokenize.simple_tokenize(text)
    toked = " ".join(toks)
    #print "\t".join(parts[:-1]) + "\t" + toked
    #try: AposFix.show_match(toked)
    #except: pass
    featstr = AposFix.gsub(toked, lambda m: m[1] + m[2].replace(" ", "") + m[3])
    featstr = featstr.lower()
    toks = featstr.split()
    feats = [ug[0] for ug in bigrams.filtered_unigrams(toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(toks)]
    print "\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats))
def TokenizeTweet(tweet):
    vec = [item.lower() for item in twokenize.simple_tokenize(tweet)]
    outputvec = [GetTokenId(item) for item in vec
                 if not item.startswith('http') and not item.startswith('@')
                 and item not in stopwords and item != '#tcot' and item != '#p2'
                 and len(item) > 1]
    return outputvec
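# Minimal sketch of the globals TokenizeTweet depends on. The stopwords file name and
# GetTokenId definition here are hypothetical: GetTokenId is assumed to be a lookup that
# grows a token-to-id map the first time each token is seen.
stopwords = set(line.strip().lower() for line in open("stopwords.txt"))
token_ids = {}

def GetTokenId(token):
    # assign a new integer id on first sight, then reuse it
    if token not in token_ids:
        token_ids[token] = len(token_ids)
    return token_ids[token]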
import twokenize
import codecs

ipf = codecs.open("deliverable2input.txt", "rU", "utf-8")
stopf = codecs.open("stopwords.txt", "rU", "utf-8")
opf = codecs.open("deliverable2output.txt", "w", "utf-8")

# strip the trailing newline so the membership test against tokens actually matches
stopwords = [line.strip().lower() for line in stopf]

for tweet in ipf:
    opf.write(tweet)
    if tweet[-1] != "\n":
        opf.write("\n")
    vec = [item.lower() for item in twokenize.simple_tokenize(tweet)]
    outputvec = [
        item for item in vec
        if not item.startswith("http") and not item.startswith("@")
        and item not in stopwords and item != "#tcot" and item != "#p2"
        and len(item) > 1
    ]
    opf.write(u" ".join(outputvec) + u"\n")
    for i in range(80):
        opf.write("-")
    opf.write("\n")

opf.close()
ipf.close()
stopf.close()