def read_twits(inp, output):
    """Fetch tweets by id and store each one as JSON under <output>.

    Each line of <inp> has the form '<tweet_id>;<lang>'.  Every tweet is
    written to the file <output><lang><tweet_id> (plain string
    concatenation, matching the original on-disk layout).  Tweets whose
    file already exists are skipped; tweets the API refuses to return
    are reported and skipped.
    """
    text = read_data(inp)
    for line in text:
        idshka, lang = line.split(';')
        # Respect the Twitter API rate limit: sleep while 10 or fewer
        # calls remain, and also back off when the limit query itself
        # fails.  Narrowed from a bare `except:` so KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        while True:
            try:
                limits = tw.GetRateLimitStatus('statuses')[u'resources'][u'statuses'][u'/statuses/show/:id']
                if limits[u'remaining'] <= 10:
                    time.sleep(10)
                else:
                    break
            except Exception:
                time.sleep(10)
        # The open() probe doubles as an "already downloaded?" check;
        # IOError means the file is missing and the tweet must be fetched.
        try:
            f = open(output + lang + idshka, 'r')
            f.close()
        except IOError:
            try:
                s = get_one(int(idshka))
                write_data(output + lang + idshka, s.AsJsonString())
            except Exception:
                # Tweet was deleted/protected, or the API call failed.
                # (Parenthesized print works identically under Python 2.)
                print('Twit ' + idshka + ' is hidden now')
def russian_get_text(inp, output):
    """Parse Julia Rubtsova's sentiment .csv corpus and write one
    tab-separated feature row per tweet to <output>.

    Source corpus: 'Метод построения и анализа корпуса коротких текстов
    для задачи классификации отзывов'.  Row layout:
    text, user-name, screen-name, location, hashtags, mentions, links
    (user-name and location are not present in this corpus).
    """
    rows = []
    quoted_field = '''"(.*?)";'''
    for record in read_data(inp):
        fields = nltk.regexp_tokenize(record, quoted_field)
        if len(fields) < 4:
            continue

        raw = fields[3][1:-2]
        # Feature extraction runs on the raw text, before cleaning.
        mentions = get_mentions(raw)
        links = get_links(raw)
        hashtags = get_hashtags(raw)
        cleaned = process(raw)
        screen_name = fields[2][1:-2]
        if cleaned == '':
            continue

        rows.append('\t'.join([cleaned, 'not-given', screen_name,
                               'not-given', ','.join(hashtags),
                               ','.join(mentions), ','.join(links)]))

    write_data(output, rows)
def bergsma_get_text(inp, output):
    """Extract tweet text plus features from Shane Bergsma's
    tab-separated format ('Language Identification for Creating
    Language-Specific Twitter Collections') and write one tab-separated
    row per tweet to <output>.

    Row layout: text, user-name, screen-name, location, hashtags,
    mentions, links.
    """
    data = read_data(inp)

    res = []
    for line in data:
        tokens = line.split('\t')
        # tokens[5] (text), tokens[1] (screen name), tokens[2] (user
        # name) and tokens[3] (location) are all read below, so at least
        # 6 columns are required.  (The original guard was '< 3', which
        # let short lines raise IndexError.)
        if len(tokens) < 6:
            continue

        text = tokens[5]
        # Features are extracted from the raw text, before cleaning.
        mentions = get_mentions(text)
        links = get_links(text)
        hashtags = get_hashtags(text)
        text = process(text)
        if text == '':
            # Skip empty tweets before building the row (the original
            # built and joined the row first, then discarded it).
            continue

        sname = tokens[1]
        uname = tokens[2]
        location = tokens[3]
        row = '\t'.join([text, uname, sname, location,
                         ','.join(hashtags), ','.join(mentions), ','.join(links)])
        res.append(row)

    write_data(output, res)
def bergsma_get_text(inp, output):
    """Extract the plain tweet text (column 5) from Shane Bergsma's
    tab-separated format ('Language Identification for Creating
    Language-Specific Twitter Collections') and write the texts to
    <output>, separated by a dashed-line delimiter.

    NOTE(review): this redefinition shadows the feature-extracting
    bergsma_get_text defined earlier in this file.
    """
    data = read_data(inp)

    res = []
    for line in data:
        tokens = line.split('\t')
        # tokens[5] is read below, so at least 6 columns are required.
        # (The original guard was '< 3', which let lines with 3-5
        # columns raise IndexError.)
        if len(tokens) < 6:
            continue
        res.append(tokens[5])

    sep = '\n' + '-' * 60 + '\n' * 4
    write_data(output, res, sep)
def russian_get_text(inp, output):
    """Pull the raw tweet text out of Julia Rubtsova's sentiment .csv
    corpus ('Метод построения и анализа корпуса коротких текстов для
    задачи классификации отзывов') and write it to <output>, one tweet
    per dashed-line-separated section.

    NOTE(review): this redefinition shadows the feature-extracting
    russian_get_text declared earlier in this file.
    """
    quoted_field = '''"(.*?)";'''
    texts = []
    for record in read_data(inp):
        fields = nltk.regexp_tokenize(record, quoted_field)
        if len(fields) >= 4:
            # Column 3 holds the tweet text; strip the quote characters
            # left around the field by the tokenizer.
            texts.append(fields[3][1:-2])

    delimiter = '\n' + '-' * 60 + '\n' * 4
    write_data(output, texts, delimiter)
from string_processing import *
import os, os.path
import sys
import codecs
from scripts import read_data, write_data

#######################################################
# Script entry point: read every .txt file in sys.argv[1], split it
# into tweets on 'separator', clean each tweet with process(), and
# write the non-empty results to sys.argv[2] under the same file name.
folder = sys.argv[1]
separator = "\n" + "-" * 60 + "\n" * 4

out_dir = sys.argv[2]
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)

for name in os.listdir(folder):
    if not name.endswith(".txt"):
        continue
    raw_tweets = read_data(os.path.join(folder, name), separator)
    # Keep only tweets that survive cleaning.
    cleaned = [t for t in (process(raw) for raw in raw_tweets) if t != ""]
    write_data(os.path.join(out_dir, name), cleaned)