import codecs, json, os, os.path, sys, time
import nltk
from string_processing import *
from scripts import read_data, write_data

def bergsma_get_text(inp, output):
    # Extract text together with features (user name, screen name, location,
    # hashtags, mentions and links) from tweets in the format provided by
    # Shane Bergsma for the paper
    # 'Language Identification for Creating Language-Specific Twitter Collections'.
    data = read_data(inp)

    res = []
    for line in data:
        tokens = line.split('\t')
        # fields used below: 1 = screen name, 2 = user name, 3 = location,
        # 5 = tweet text, so require at least six columns
        if len(tokens) < 6:
            continue

        text = tokens[5]
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        # skip tweets whose text is empty after processing
        if text == '':
            continue

        sname = tokens[1]
        uname = tokens[2]
        location = tokens[3]
        row = [text, uname, sname, location, ','.join(hashtags), ','.join(mentions), ','.join(links)]
        res.append('\t'.join(row))
    
    write_data(output, res)
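
# A minimal usage sketch (the file names are hypothetical): read Bergsma-format
# tweets and write one tab-separated feature row per tweet:
#
#   bergsma_get_text('bergsma_tweets.tsv', 'bergsma_features.tsv')
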
def read_twits(inp, output):
    # Read tweet ids from <inp> and write the corresponding tweets (in JSON
    # format) to the folder <output>, one file per tweet. Relies on <tw>,
    # an authenticated Twitter API client (the calls match python-twitter's
    # Api; see the get_one sketch below).
    text = read_data(inp)
    for line in text:
        # each line has the form '<tweet id>;<language>'
        idshka, lang = line.strip().split(';')
        # wait until the Twitter API rate limit allows more status lookups
        while True:
            try:
                limits = tw.GetRateLimitStatus('statuses')[u'resources'][u'statuses'][u'/statuses/show/:id']
                if limits[u'remaining'] <= 10:
                    time.sleep(10)
                else:
                    break
            except Exception:
                # the rate-limit query itself can fail; back off and retry
                time.sleep(10)
        # skip tweets that have already been downloaded
        if os.path.exists(output + lang + idshka):
            continue
        try:
            s = get_one(int(idshka))
            write_data(output + lang + idshka, s.AsJsonString())
        except Exception:
            print 'Tweet ' + idshka + ' is hidden now'
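
# get_one is used above but not defined in this file; a minimal sketch of it,
# assuming <tw> is an authenticated python-twitter Api instance created
# elsewhere, e.g. (the credentials are placeholders):
#
#   import twitter
#   tw = twitter.Api(consumer_key='...', consumer_secret='...',
#                    access_token_key='...', access_token_secret='...')
#
def get_one(status_id):
    # fetch a single tweet (a twitter.Status object) by its numeric id
    return tw.GetStatus(status_id)
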
def russian_get_text(inp, output):
    # Parse tweets with features from the .csv corpus by Julia Rubtsova, from
    # 'A Method for Building and Analysing a Corpus of Short Texts for the
    # Task of Review Classification'.
    data = read_data(inp)

    res = []
    # each csv field looks like '"<value>";'; nltk returns the whole match,
    # so the slicing below drops the opening quote and the closing '";'
    pattern = '''"(.*?)";'''
    for line in data:
        tokens = nltk.regexp_tokenize(line, pattern)
        if len(tokens) < 4:
            continue

        # field 3 is the tweet text
        text = tokens[3][1:-2]
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        sname = tokens[2][1:-2]
        if text == '':
            continue

        row = [text, 'not-given', sname, 'not-given', ','.join(hashtags), ','.join(mentions), ','.join(links)]
        row = '\t'.join(row)

        res.append(row)

    write_data(output, res)
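
# A minimal usage sketch (hypothetical file names): parse the Rubtsova .csv
# corpus into tab-separated feature rows:
#
#   russian_get_text('rubtsova_corpus.csv', 'russian_features.tsv')
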
def plain_get_text(folder, output, prefix=''):
    # Concatenate the raw text of every file in <folder> whose name starts
    # with <prefix> into <output>, joined by a dashed-line separator.
    files = filter(lambda x: x.startswith(prefix), os.listdir(folder))
    separator = '\n' + '-' * 60 + '\n' * 4
    out_data = []
    for name in files:
        with codecs.open(os.path.join(folder, name), 'r', 'utf-8') as fi:
            out_data.append(fi.read())

    write_data(output, out_data, separator)

def bergsma_get_text(inp, output):
    # Plain-text variant of bergsma_get_text above (redefines it if both are
    # kept in one module): extract only the tweet text from a file in the
    # format provided by Shane Bergsma for the paper
    # 'Language Identification for Creating Language-Specific Twitter Collections'.
    data = read_data(inp)

    res = []
    for line in data:
        tokens = line.split('\t')
        # the tweet text is field 5, so require at least six columns
        if len(tokens) < 6:
            continue
        res.append(tokens[5])
    
    sep = '\n' + '-' * 60 + '\n' * 4
    write_data(output, res, sep)

def russian_get_text(inp, output):
    # Plain-text variant (redefines russian_get_text above if both are kept):
    # extract only the tweet text from the .csv corpus by Julia Rubtsova, from
    # 'A Method for Building and Analysing a Corpus of Short Texts for the
    # Task of Review Classification'.
    data = read_data(inp)

    res = []
    pattern = '''"(.*?)";'''
    for line in data:
        tokens = nltk.regexp_tokenize(line, pattern)
        if len(tokens) < 4:
            continue
        # field 3 is the tweet text; drop the quote and the trailing '";'
        res.append(tokens[3][1:-2])

    sep = '\n' + '-' * 60 + '\n' * 4
    write_data(output, res, sep)
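
# A minimal usage sketch (hypothetical file names); the plain-text variants
# above write tweets joined by the dashed separator, which the command-line
# script at the bottom of this file reads back with the same separator:
#
#   russian_get_text('rubtsova_corpus.csv', 'russian_plain.txt')
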
def plain_get_text(folder, output, prefix=''):
    # Feature-row variant (redefines plain_get_text above if both are kept):
    # write the processed text of every file in <folder> whose name starts
    # with <prefix> to <output>, padding the remaining feature columns with
    # 'not-given' placeholders and empty strings.
    files = filter(lambda x: x.startswith(prefix), os.listdir(folder))
    out_data = []
    for name in files:
        with codecs.open(os.path.join(folder, name), 'r', 'utf-8') as fi:
            out = fi.read()

        out = process(out)
        if out == '':
            continue
        out = [out] + ['not-given', 'not-given', 'not-given', '', '', '']
        out = '\t'.join(out)

        out_data.append(out)

    write_data(output, out_data)
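
# A minimal usage sketch (hypothetical names); the output rows use the same
# seven-column layout as bergsma_get_text above and json_get_text below:
#
#   plain_get_text('plain_texts/', 'plain_features.tsv', prefix='en_')
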
def json_get_text(folder, output, prefix=''):
    # Extract text plus features from every JSON tweet file in <folder>
    # whose name starts with <prefix> (e.g. the files written by read_twits)
    # and write one tab-separated row per tweet to <output>.
    files = filter(lambda x: x.startswith(prefix), os.listdir(folder))
    out_data = []
    for name in files:
        # close the file right after parsing so the handle is not leaked
        # when the tweet is skipped below
        with codecs.open(os.path.join(folder, name), 'r', 'utf-8') as fi:
            out = json.load(fi)

        text = out[u'text']
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        if text == '':
            continue

        sname = process(out[u'user'][u'screen_name'])

        try:
            uname = process(out[u'user'][u'name'])
        except KeyError:
            uname = 'not-given'

        try:
            location = process(out[u'user'][u'location'])
        except KeyError:
            location = 'not-given'

        row = [text, uname, sname, location, ','.join(hashtags), ','.join(mentions), ','.join(links)]
        row = '\t'.join(row)

        out_data.append(row)
    
    write_data(output, out_data)
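
# A minimal usage sketch (hypothetical names); the input folder is the one
# filled with JSON tweet files by read_twits above:
#
#   json_get_text('tweets_json/', 'json_features.tsv', prefix='en')
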

#######################################################
# Command-line script: read every .txt file from folder $1, parse the tweets
# inside (tweets are expected to be separated by 'separator') and write the
# cleaned result to folder $2 under the same file name.
folder = sys.argv[1]
separator = "\n" + "-" * 60 + "\n" * 4
if not os.path.isdir(sys.argv[2]):
    os.mkdir(sys.argv[2])

for fi in os.listdir(folder):
    if fi.endswith(".txt"):
        texts = read_data(os.path.join(folder, fi), separator)
        parsed = []
        for text in texts:
            text = process(text)
            if text != "":
                parsed.append(text)
        write_data(os.path.join(sys.argv[2], fi), parsed)
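
# Example invocation (the script and folder names are hypothetical):
#
#   python parse_texts.py raw_texts/ parsed_texts/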