def bergsma_get_text(inp, output):
    # Extract text plus feature columns from tweets in the tab-separated
    # format provided by Shane Bergsma for the paper
    # 'Language Identification for Creating Language-Specific Twitter Collections'.
    #
    # inp    -- path handed to read_data(); one tweet record per line.
    # output -- path handed to write_data(); one tab-separated row per tweet:
    #           text, user name, screen name, location, hashtags, mentions, links.
    data = read_data(inp)
    res = []
    for line in data:
        tokens = line.split('\t')
        # The tweet text lives in column 5, so at least 6 columns are required.
        # (The original guard of 3 let lines with 3-5 columns raise IndexError.)
        if len(tokens) < 6:
            continue
        text = tokens[5]
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        if text == '':
            continue  # nothing left after cleaning -- skip before building the row
        sname = tokens[1]
        uname = tokens[2]
        location = tokens[3]
        row = [text, uname, sname, location,
               ','.join(hashtags), ','.join(mentions), ','.join(links)]
        res.append('\t'.join(row))
    write_data(output, res)
def read_twits(inp, output):
    # Read tweet ids from <inp>, write tweets (json format) to folder <output>.
    #
    # Each input line looks like '<id>;<lang>'; a fetched tweet is stored as
    # JSON in a file whose name is output + lang + id.
    # NOTE(review): 'tw' (twitter API client), 'get_one', 'time', 'read_data'
    # and 'write_data' are module-level names defined elsewhere -- confirm.
    text = read_data(inp)
    was = False  # NOTE(review): never used below -- leftover from an earlier version
    for line in text:
        idshka, lang = line.split(';')
        # Block until the statuses/show rate-limit quota has headroom again.
        while True:
            try:
                limits = tw.GetRateLimitStatus('statuses')[u'resources'][u'statuses'][u'/statuses/show/:id']
                if limits[u'remaining'] <= 10:
                    time.sleep(10)
                else:
                    break
            except:
                # The rate-limit query itself failed (network/API error):
                # back off and retry indefinitely.
                time.sleep(10)
        try:
            # Probe for an already-downloaded tweet; IOError means "not yet".
            f = open(output + lang + idshka, 'r')
            f.close()
        except IOError:
            try:
                s = get_one(int(idshka))
                write_data(output + lang + idshka, s.AsJsonString())
            except:
                # Tweet deleted, protected, or fetch failed -- best effort, move on.
                print 'Twit ' + idshka + ' is hidden now'
def russian_get_text(inp, output):
    # Parse tweets from the .csv corpus by Julia Rubtsova, described in
    # 'Метод построения и анализа корпуса коротких текстов для задачи классификации отзывов',
    # and write one tab-separated feature row per tweet to <output>:
    # text, user name (placeholder), screen name, location (placeholder),
    # hashtags, mentions, links.
    pattern = '''"(.*?)";'''
    rows = []
    for line in read_data(inp):
        fields = nltk.regexp_tokenize(line, pattern)
        if len(fields) < 4:
            continue
        # Strip the surrounding quote/semicolon residue from the match.
        text = fields[3][1:-2]
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        if text == '':
            continue
        sname = fields[2][1:-2]
        rows.append('\t'.join([text, 'not-given', sname, 'not-given',
                               ','.join(hashtags), ','.join(mentions),
                               ','.join(links)]))
    write_data(output, rows)
def plain_get_text(folder, output, prefix=''):
    # Concatenate the raw contents of every file in <folder> whose name
    # starts with <prefix> into <output>, entries separated by a dashed divider.
    separator = '\n' + '-' * 60 + '\n' * 4
    chunks = []
    for name in os.listdir(folder):
        if not name.startswith(prefix):
            continue
        handle = codecs.open(os.path.join(folder, name), 'r', 'utf-8')
        chunks.append(handle.read())
        handle.close()
    write_data(output, chunks, separator)
def bergsma_get_text(inp, output):
    # Extract only the plain tweet text from the tab-separated format
    # provided by Shane Bergsma for the paper
    # 'Language Identification for Creating Language-Specific Twitter Collections',
    # writing the texts to <output> separated by a dashed divider.
    data = read_data(inp)
    res = []
    for line in data:
        tokens = line.split('\t')
        # The text is column 5, so require at least 6 columns.
        # (The original guard of 3 let lines with 3-5 columns raise IndexError.)
        if len(tokens) < 6:
            continue
        res.append(tokens[5])
    sep = '\n' + '-' * 60 + '\n' * 4
    write_data(output, res, sep)
def russian_get_text(inp, output):
    # Extract only the plain tweet text from Julia Rubtsova's .csv corpus
    # ('Метод построения и анализа корпуса коротких текстов для задачи классификации отзывов')
    # and write it to <output>, entries separated by a dashed divider.
    pattern = '''"(.*?)";'''
    texts = []
    for line in read_data(inp):
        fields = nltk.regexp_tokenize(line, pattern)
        if len(fields) >= 4:
            # Strip the surrounding quote/semicolon residue from the match.
            texts.append(fields[3][1:-2])
    write_data(output, texts, '\n' + '-' * 60 + '\n' * 4)
def plain_get_text(folder, output, prefix=''):
    # Build feature rows from every file in <folder> whose name starts with
    # <prefix>: the processed text plus placeholder metadata columns
    # (user name, screen name, location, hashtags, mentions, links).
    rows = []
    for name in os.listdir(folder):
        if not name.startswith(prefix):
            continue
        handle = codecs.open(os.path.join(folder, name), 'r', 'utf-8')
        raw = handle.read()
        handle.close()
        text = process(raw)
        if text == '':
            continue  # file reduced to nothing after cleaning
        rows.append('\t'.join([text, 'not-given', 'not-given', 'not-given',
                               '', '', '']))
    write_data(output, rows)
def json_get_text(folder, output, prefix=''):
    # Extract feature rows from the tweet JSON files in <folder> whose names
    # start with <prefix>, writing one tab-separated row per tweet to <output>:
    # text, user name, screen name, location, hashtags, mentions, links.
    files = filter(lambda x: x.startswith(prefix), os.listdir(folder))
    out_data = []
    for name in files:
        fi = codecs.open(os.path.join(folder, name), 'r', 'utf-8')
        try:
            out = json.load(fi)
        finally:
            # Close right after parsing: the original closed at the end of the
            # loop body, leaking the handle whenever the 'continue' below fired.
            fi.close()
        text = out[u'text']
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        if text == '':
            continue
        sname = process(out[u'user'][u'screen_name'])
        # 'name' and 'location' are optional in the user object.
        try:
            uname = process(out[u'user'][u'name'])
        except KeyError:
            uname = 'not-given'
        try:
            location = process(out[u'user'][u'location'])
        except KeyError:
            location = 'not-given'
        row = [text, uname, sname, location,
               ','.join(hashtags), ','.join(mentions), ','.join(links)]
        out_data.append('\t'.join(row))
    write_data(output, out_data)
from string_processing import *
import os, os.path
import sys
import codecs
from scripts import read_data, write_data

#######################################################
# Read every .txt file in the folder given as $1, split it into tweet
# entries (separated by 'separator'), clean each entry with process(),
# and write the non-empty results to a same-named file under $2.

source_dir = sys.argv[1]
target_dir = sys.argv[2]
separator = "\n" + "-" * 60 + "\n" * 4

if not os.path.isdir(target_dir):
    os.mkdir(target_dir)

for name in os.listdir(source_dir):
    if not name.endswith(".txt"):
        continue
    entries = read_data(os.path.join(source_dir, name), separator)
    cleaned = [t for t in (process(e) for e in entries) if t != ""]
    write_data(os.path.join(target_dir, name), cleaned)