Example #1
def Texts2SVM_Feature(texts, **kwargs):  #, tfidf, stopwords_list, SentiWords):
    '''
    convert a list of texts into a list of feature vectors for an SVM,
    return the list of features
    '''
    tfidf = kwargs['tfidf']
    stopwords_list = kwargs['stopwords']
    SentiWords = kwargs['SentiWords']

    # list of list of words: (#texts, #words)
    # start = time.time()
    texts = list(map(lambda text: simple_tokenize(text), texts))
    # print 'tokenize time = ', time.time() - start

    # filter useless stopwords and stemming
    # list of [#noun, #adj, #adv, #verb, #url, #hashtag, #mentions, #number, #cap, #strong_neg, #strong_pos, #weak_neg, #weak_pos]
    other_values = np.array(
        list(map(lambda text: svm_text_feature(text, stopwords_list, SentiWords),
                 texts)))

    # get tfidf value
    # texts = list(map(lambda text: ' '.join(text), texts))
    # tfidf_values = tfidf.fit_transform(texts)
    # print tfidf_values.shape

    # all_features = hstack([tfidf_values, other_values])
    # print all_features.shape
    # return all_features
    return other_values
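A minimal call sketch for Texts2SVM_Feature. It assumes numpy is imported as np in the surrounding module and that the stopword list and sentiment lexicon have already been loaded; my_stopwords and senti_lexicon below are placeholder names, not objects defined in the example above.

# hypothetical usage; my_stopwords and senti_lexicon are placeholders
texts = ['I love this #movie http://t.co/xyz', '@user worst day ever 2019']
features = Texts2SVM_Feature(texts,
                             tfidf=None,              # read but unused in the current body
                             stopwords=my_stopwords,
                             SentiWords=senti_lexicon)
# features is an np.array with one row of hand-crafted counts per input text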
Example #2
def Texts2Matrix(texts, model, max_len):
    '''
    preprocess tweets by tokenizing, embedding and padding,
    return a list of word2vec matrices
    '''
    # tokenization and replace URL, NUMBERs and MENTION with special tokens
    # start = time.time()
    texts = list(map(lambda t: simple_tokenize(t),
                     texts))  # list: (#tweets, list_of_words)
    # print 'tokenize time = ', time.time() - start

    # embedding
    # start = time.time()
    embeddings = list(
        map(lambda tweet: np.array(list(map(lambda w: w2v(w, model), tweet))),
            texts))  # list: (#tweets, np.array(#words, #dim))
    # print 'embedding time = ', time.time() - start

    # padding
    # start = time.time()
    if max_len == 0:
        max_len = max(list(map(lambda t: len(t), texts)))
    paddings = np.array(list(map(lambda x: padding_2D(x, max_len),
                                 embeddings)))
    # print 'padding2D time = ', time.time() - start

    return texts, paddings
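A sketch of how Texts2Matrix could be driven. It assumes model is a loaded word-embedding model and that the w2v helper used above returns a fixed-size vector per token; both are assumptions, not shown in the example.

# hypothetical usage; `model` is assumed to be a loaded word-embedding model
tweets = ['just landed in NYC!', 'check this out http://t.co/abc @friend']
tokens, padded = Texts2Matrix(tweets, model, max_len=30)
# tokens: list of token lists
# padded: np.array of shape (#tweets, 30, embedding_dim) after padding_2D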
Example #3
def Texts2Index(texts, vocab, max_len):
    '''
    convert a list of texts into lists of word indexes by looking them up in the vocab,
    return the list of index sequences
    '''
    # list of list of words: (#texts, #words)
    # start = time.time()
    texts = list(map(lambda text: simple_tokenize(text), texts))
    # print 'tokenize time = ', time.time() - start

    # list of list of indexs: (#texts, #idxes)
    # start = time.time()
    idxes = list(
        map(lambda words: list(map(lambda word: word2Index(word, vocab), words)),
            texts))
    # print 'word2index time = ', time.time() - start

    # padding
    # start = time.time()
    if max_len == 0:
        max_len = max(list(map(lambda t: len(t), texts)))
    idxes = list(map(lambda idx: padding_1D(idx, max_len), idxes))
    # print 'padding1D time = ', time.time() - start
    return np.array(idxes)
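A usage sketch for Texts2Index with a toy vocabulary. Real vocabularies would come from build_vocab in Example #6; the assumption here is that word2Index maps out-of-vocabulary words to some unknown-word id.

# toy vocab for illustration only; build_vocab() below produces the real one
vocab = {'good': 2, 'movie': 3}
ids = Texts2Index(['good movie', 'bad movie'], vocab, max_len=5)
# ids is an np.array of shape (2, 5); 'bad' falls back to the unknown-word id
# and padding_1D pads each index list out to max_len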
Example #4
def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
Example #5
def tokenize_and_clean(msg, alignments):
  if alignments: 
    toks = twokenize.tokenize(msg)
  else:          
    toks = twokenize.simple_tokenize(msg)
  for i in range(len(toks)):
    toks[i] = toks[i].lower()
  inds = range(len(toks))
  #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
  if alignments: 
    return toks.subset(inds)
  else:
    return [toks[i] for i in inds]
Example #6
def build_vocab(filename, min_freq=5):
    '''
    build vocab from texts in filename, with minimum frequency (5 by default)
    '''
    if not os.path.exists(filename):
        print('file %s does not exist. Please correct the name and try again.' % filename)
        return None

    parser = lambda date: pd.datetime.strptime(date[:20] + date[24:], '%c')  # date parser (unused here)
    columns = ['text']
    df = pd.read_csv(filename, names=columns, usecols=[5])  # read only column 5 (the tweet text)

    df = df['text'].tolist()
    # texts to words, words: list of list of words
    words = list(map(lambda tweet: simple_tokenize(tweet), df))
    # counting words to get vocab with (word: freq)
    freq_vocab = word2FreqVocab(words)
    # transfer freq_vocab to index vocab with (word: index)
    vocab = freq2IndexVocab(freq_vocab, min_freq)
    with open('vocab.pkl', 'wb') as f:  # binary mode for pickle
        cPickle.dump(vocab, f)

    return vocab
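A hedged end-to-end sketch that chains build_vocab with Texts2Index from Example #3. The CSV path is a placeholder; the only assumption about its layout is the one already made by the read_csv call above (tweet text in column 5).

# hypothetical pipeline; 'tweets.csv' is a placeholder path
vocab = build_vocab('tweets.csv', min_freq=5)
if vocab is not None:
    idx_matrix = Texts2Index(['sample tweet text'], vocab, max_len=40)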
Example #7
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
import twokenize, util, bigrams
util.fix_stdio()
from sane_re import *

AposFix = _R(r"( |^)(' [stm])( |$)")  # matches contractions the tokenizer split apart, e.g. "don ' t"

for line in sys.stdin:
    parts = util.unicodify(line[:-1]).split("\t")
    text = parts[-1]
    toks = twokenize.simple_tokenize(text)
    toked = " ".join(toks)
    #print "\t".join(parts[:-1]) + "\t" + toked
    #try: AposFix.show_match(toked)
    #except: pass
    featstr = AposFix.gsub(toked,
                           lambda m: m[1] + m[2].replace(" ", "") + m[3])
    featstr = featstr.lower()
    toks = featstr.split()
    feats = [ug[0] for ug in bigrams.filtered_unigrams(toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(toks)]

    print "\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats))
Example #8
def TokenizeTweet(tweet):
  vec = [item.lower() for item in twokenize.simple_tokenize(tweet)]
  outputvec = [GetTokenId(item) for item in vec
               if not item.startswith('http')
               and not item.startswith('@')
               and item not in stopwords
               and item != '#tcot'
               and item != '#p2'
               and len(item) > 1]
  return outputvec
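A call sketch for TokenizeTweet. GetTokenId and the module-level stopwords list are assumed to exist in the surrounding code, so this is only illustrative.

# hypothetical call; GetTokenId and stopwords are globals in the source module
tweet = 'RT @user: #tcot budget talks resume http://t.co/xyz'
token_ids = TokenizeTweet(tweet)
# URLs, @mentions, stopwords, '#tcot', '#p2' and one-character tokens are dropped
# before the surviving tokens are mapped to integer ids by GetTokenId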
Example #9
import sys
sys.path.insert(0,'/usr2/corpora/tweets/tweetmotif')
import twokenize,util,bigrams
util.fix_stdio()
from sane_re import *

AposFix = _R(r"( |^)(' [stm])( |$)")  # matches contractions the tokenizer split apart, e.g. "don ' t"


for line in sys.stdin:
  parts = util.unicodify(line[:-1]).split("\t")
  text = parts[-1]
  toks = twokenize.simple_tokenize(text)
  toked = " ".join(toks)
  #print "\t".join(parts[:-1]) + "\t" + toked
  #try: AposFix.show_match(toked)
  #except: pass
  featstr = AposFix.gsub(toked, lambda m: m[1]+m[2].replace(" ","")+m[3])
  featstr = featstr.lower()
  toks = featstr.split()
  feats = [ug[0] for ug in bigrams.filtered_unigrams(toks)]
  feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(toks)]

  print "\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats))
Example #10
import twokenize
import codecs

ipf = codecs.open("deliverable2input.txt", "rU", "utf-8")
stopf = codecs.open("stopwords.txt", "rU", "utf-8")
opf = codecs.open("deliverable2output.txt", "w", "utf-8")
stopwords = [line.strip().lower() for line in stopf]  # strip trailing newlines so membership tests match tokens

for tweet in ipf:
    opf.write(tweet)
    if tweet[-1] != "\n":
        opf.write("\n")
    vec = [item.lower() for item in twokenize.simple_tokenize(tweet)]
    outputvec = [
        item
        for item in vec
        if not item.startswith("http")
        and not item.startswith("@")
        and item not in stopwords
        and item != "#tcot"
        and item != "#p2"
        and len(item) > 1
    ]
    opf.write(u" ".join(outputvec) + u"\n")
    for i in range(80):
        opf.write("-")
    opf.write("\n")


opf.close()
ipf.close()
stopf.close()