Example #1
def get_new_tasks_list(end_hour=24):
    time_slots = generate_daily_time_slots(end_hour)
    task_list = []

    already_processed_time_slots = get_processed_time_slots_from_checkpoint()

    for slot in time_slots:
        # name the interval by the time-of-day part (last six characters) of each slot boundary
        INTERVAL = slot[0][-6:] + "-" + slot[1][-6:]

        # images
        images_dir = IMAGES_DIR.replace('{INTERVAL}', INTERVAL)
        if not hdfs.exists(images_dir):
            hdfs.mkdir(images_dir)

        # texts
        texts_dir = TEXTS_DIR.replace('{INTERVAL}', INTERVAL)
        if not hdfs.exists(texts_dir):
            hdfs.mkdir(texts_dir)

        if slot not in already_processed_time_slots:
            url = API_URL.replace('{START_DATETIME}', slot[0])
            url = url.replace('{END_DATETIME}', slot[1])
            task_list.append(url)
        else:
            print("Omitting previously processed slot: {0}".format(slot))

    return task_list
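
The helper generate_daily_time_slots called above is not part of this listing. Here is a minimal sketch of what it is assumed to return, based on how the slots are used (GDELT-style YYYYMMDDHHMMSS boundaries whose last six characters form the interval name); the hourly granularity and the RUN_CONTROL_DATE default are assumptions, not the project's actual helper:

def generate_daily_time_slots(end_hour, date=RUN_CONTROL_DATE):
    # One [start, end] pair of YYYYMMDDHHMMSS strings per hour of the run date.
    slots = []
    for hour in range(end_hour):
        start = '{0}{1:02d}0000'.format(date, hour)
        end = '{0}{1:02d}5959'.format(date, hour)
        slots.append([start, end])
    return slots
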
Example #2
def get_processed_time_slots_from_checkpoint():
    processed_slots = list()
    checkpoint_file_content = hdfs.readFileAsString(
        API_EXTRACTION_CHECKPOINT_FILE)

    # skip the checkpoint header line, then walk the remaining records
    for line in [x.strip() for x in checkpoint_file_content.split("\n")][1:]:
        if line == "":
            continue

        url = line.split("|")[0]
        hdfs_file = line.split("|")[2]

        print(url)
        print(hdfs_file)

        parsed_url = urlparse.urlparse(url)

        start_datetime = urlparse.parse_qs(
            parsed_url.query)['STARTDATETIME'][0]
        end_datetime = urlparse.parse_qs(parsed_url.query)['ENDDATETIME'][0]

        if hdfs.exists(hdfs_file):
            # file also has to exist
            processed_slots.append([start_datetime, end_datetime])

    return processed_slots
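
# The checkpoint file read above is assumed to be pipe-delimited with a header
# line, the request URL in field 0 and the produced HDFS file in field 2. The
# middle field name and the values below are hypothetical, shown only to
# illustrate the layout the parser expects:
#
#   sample_line = ('https://api.gdeltproject.org/api/v2/doc/doc?query=sourcelang:english'
#                  '&format=json&STARTDATETIME=20200101000000&ENDDATETIME=20200101005959'
#                  '|2020-01-01T01:00|/data/gdelt/20200101/texts/000000-005959/data.json')
#   url, finish_date, hdfs_file = sample_line.split('|')
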
def getStopWords():
    URL = 'https://raw.githubusercontent.com/aneesha/RAKE/master/SmartStoplist.txt'
    path = '/tech/STOPWORDS.txt'
    if hdfs.exists(path):
        return
    response = urllib2.urlopen(URL)
    content = response.read()
    # drop the first line of the downloaded list (a header comment), keep the words
    stopWords = content.split('\n')[1:]
    hdfs.write(path, '\n'.join(stopWords))
Example #4
def getCheckpointCount(path):
    if not hdfs.exists(path):
        return 0
    checkpointFileContent = hdfs.readFileAsString(path)
    counter = 0
    for line in checkpointFileContent.split('\n')[1:]:
        if '/data/gdelt' in line:
            counter = counter + 1
    return counter
def getCountries():
    URL = 'https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv'
    path = hdfsPath + FILE_NAME.replace('{type}', 'country')
    if hdfs.exists(path):
        return
    cameoResponse = urllib2.urlopen(URL)
    cameoContent = cameoResponse.read()
    countries = []
    for line in cameoContent.split('\n')[1:]:
        splitted = line.split(',')
        if len(splitted) == 3:
            countries.append(splitted[0] + '\t' + splitted[2])
    hdfs.write(path, '\n'.join(countries))
Example #6
def getCheckpointsList(path, RUN_CONTROL_DATE):
    EXPORT_FILE_SUFFIX = '.export.csv'
    path = path.replace('{DATE}', str(RUN_CONTROL_DATE))
    if not hdfs.exists(path):
        return []

    checkpointFileContent = hdfs.readFileAsString(path)
    checkpointList = []
    for line in checkpointFileContent.split('\n')[1:]:
        if line == '':
            continue

        if not line.endswith(EXPORT_FILE_SUFFIX):
            errorMessage = '"{}" does not end with the suffix ".export.csv"'.format(line)
            print(line)
            hdfs.log(LOG_PATH, errorMessage, True)
        else:
            splitted_line = line.split('/')
            pathDate = splitted_line[3]
            if pathDate == RUN_CONTROL_DATE:
                fileName = splitted_line[5].split('.')[0]
                checkpointList.append(fileName)
    return checkpointList
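
getCheckpointsList above depends on a fixed path depth: after split('/'), index 3 must hold the run date and index 5 the export file name. A quick illustration with a hypothetical checkpoint entry (the directory layout is an assumption chosen to match those indices):

sample = '/data/gdelt/20200101/csv/20200101000000.export.csv'
parts = sample.split('/')           # ['', 'data', 'gdelt', '20200101', 'csv', '20200101000000.export.csv']
pathDate = parts[3]                 # '20200101'
fileName = parts[5].split('.')[0]   # '20200101000000'
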
Example #7

def enqueueTasks(TASK_LIST, LIST_NAME):
    que = redis.Redis(host=REDIS_URL, port=6379)
    hdfs.log(LOG_PATH, 'Connected to Redis', False)
    que.delete(LIST_NAME)
    for task in TASK_LIST:
        que.lpush(LIST_NAME, str(task))
        hdfs.log(LOG_PATH,
                 'LeftPushed ' + str(task) + ' into ' + LIST_NAME + ' list',
                 False)
    que.client_kill_filter(_id=que.client_id())
    hdfs.log(LOG_PATH, 'Disconnected from Redis', False)


if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

LOG_PATH = LOG_PATH.replace('{DATE}', DATE)
hdfs.mkdir(LOG_PATH)
LOG_PATH = LOG_PATH + '/extraction-csv.log'
if not hdfs.exists(LOG_PATH):
    hdfs.touch(LOG_PATH)
generateDirectoriesTree(DATE, DATA_DIR, DATA_SUBDIRS)
generateDirectoriesTree(DATE, DB_DIR, [])

if len(sys.argv) > 1:
    print("Max hour {}".format(sys.argv[1]))
Example #8
import hdfs_utils as hdfs
import urlparse
import redis
import sys

RUN_CONTROL_DATE_FILE = '/tech/RUN_CONTROL_DATE.dat'

# redis
REDIS_URL = 'redis-tasks'
QUE_NAME = 'API_DOWNLOAD'

# api
API_URL = 'https://api.gdeltproject.org/api/v2/doc/doc?query=sourcelang:english&format=json&STARTDATETIME={START_DATETIME}&ENDDATETIME={END_DATETIME}'

if not hdfs.exists(RUN_CONTROL_DATE_FILE):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_DATE_FILE))
RUN_CONTROL_DATE = hdfs.readFileAsString(RUN_CONTROL_DATE_FILE)
if RUN_CONTROL_DATE.endswith('\n'):
    RUN_CONTROL_DATE = RUN_CONTROL_DATE[:-1]

# logs
API_EXTRACTION_LOG_DIR = '/tech/extraction/{RUN_CONTROL_DATE}/log/'
API_EXTRACTION_LOG_FILE = '/tech/extraction/{RUN_CONTROL_DATE}/log/extraction-api.log'
API_EXTRACTION_LOG_DIR = API_EXTRACTION_LOG_DIR.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)
API_EXTRACTION_LOG_FILE = API_EXTRACTION_LOG_FILE.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)

# checkpoints
API_EXTRACTION_CHECKPOINT_DIR = '/tech/extraction/{RUN_CONTROL_DATE}/checkpoint'
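
Every example goes through an hdfs_utils wrapper that is not shown on this page. Below is a rough sketch of the interface the calls above imply (exists, mkdir, touch, readFileAsString, write, log), implemented on top of the hdfs dfs command line purely for illustration; the project's real module may well use WebHDFS or another client:

# hdfs_utils.py -- illustrative sketch, not the project's actual module
import subprocess


def _run(args, data=None):
    # Run 'hdfs dfs <args>', optionally feeding data on stdin, and return
    # (exit code, captured stdout).
    proc = subprocess.Popen(['hdfs', 'dfs'] + args,
                            stdin=subprocess.PIPE if data is not None else None,
                            stdout=subprocess.PIPE)
    out, _ = proc.communicate(data)
    return proc.returncode, out


def exists(path):
    returncode, _ = _run(['-test', '-e', path])
    return returncode == 0


def mkdir(path):
    _run(['-mkdir', '-p', path])


def touch(path):
    _run(['-touchz', path])


def readFileAsString(path):
    _, out = _run(['-cat', path])
    return out


def write(path, content):
    # 'hdfs dfs -put -f - <dst>' reads the new file content from stdin.
    _run(['-put', '-f', '-', path], data=content)


def log(path, message, isError):
    # Append one line to the log file; the error flag only prefixes the line here.
    prefix = 'ERROR: ' if isError else ''
    _run(['-appendToFile', '-', path], data=prefix + message + '\n')
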
Example #9
def assertExist(path):
    if not hdfs.exists(path):
        print("File '" + path + "' does not exist.")
    else:
        print("'{}'\t\tOK".format(path))
    HDFS_PATH = ACCEPT_PATH + CSV_FILENAME[0:-4] + '.csv'
    readAndPutToHdfs(CSV_FILENAME, HDFS_PATH)
    checkpoint(CHECKPOINT_PATH, HDFS_PATH)
    print("CHECKPOINT: " + HDFS_PATH)
    remove(ZIP_FILENAME)
    remove(CSV_FILENAME)


def parseTask(task):
    taskList = []
    for quotedTask in task[1][1:-1].split(', '):
        taskList.append(quotedTask[1:-1])
    return taskList
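
# Usage sketch for parseTask: it undoes str() applied to a Python list, i.e. the
# payload of a (queue_name, payload) pair such as redis BRPOP returns. The queue
# name and file names below are hypothetical:
#
#   task = ('CSV_DOWNLOAD', "['20200101.export.CSV.zip', '20200102.export.CSV.zip']")
#   parseTask(task)   # -> ['20200101.export.CSV.zip', '20200102.export.CSV.zip']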


if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

REJECT_PATH = REJECT_PATH.replace('{DATE}', DATE)
ACCEPT_PATH = ACCEPT_PATH.replace('{DATE}', DATE)
LOG_PATH = LOG_PATH.replace('{DATE}', DATE)
CHECKPOINT_PATH = CHECKPOINT_PATH.replace('{DATE}', DATE)

if not hdfs.exists(LOG_PATH):
    hdfs.touch(LOG_PATH)

if not hdfs.exists(CHECKPOINT_PATH):
    hdfs.write(CHECKPOINT_PATH, 'FINISH_DATE|FILE_LOCATION')
import hdfs_utils as hdfs
import urllib2

RUN_CONTROL_PATH = '/tech/RUN_CONTROL_DATE.dat'

if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

FILE_NAME = 'CAMEO.{type}.txt'
URL = 'https://www.gdeltproject.org/data/lookups/CAMEO.{type}.txt'
TYPES = ['type', 'knowngroup', 'ethnic', 'religion', 'eventcodes']
hdfsPath = '/data/gdelt/' + str(DATE) + '/cameo/'
for cameoType in TYPES:
    path = hdfsPath + FILE_NAME.replace('{type}', cameoType)
    if hdfs.exists(path):
        continue
    cameoResponse = urllib2.urlopen(URL.replace('{type}', cameoType))
    tmp = cameoResponse.read().split('\n')[1:]
    cameoContent = '\n'.join(tmp)
    hdfs.write(path, cameoContent)


def getCountries():
    URL = 'https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv'
    path = hdfsPath + FILE_NAME.replace('{type}', 'country')
    if hdfs.exists(path):
        return
    cameoResponse = urllib2.urlopen(URL)
import os
import redis
import urlparse
import hdfs_utils as hdfs
import urllib2
import datetime

# redis
REDIS_URL = 'redis-tasks'
QUE_NAME = 'API_DOWNLOAD'

# RUN_CONTROL_DATE
RUN_CONTROL_DATE_FILE = '/tech/RUN_CONTROL_DATE.dat'

if not hdfs.exists(RUN_CONTROL_DATE_FILE):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_DATE_FILE))
RUN_CONTROL_DATE = hdfs.readFileAsString(RUN_CONTROL_DATE_FILE)
if RUN_CONTROL_DATE.endswith('\n'):
    RUN_CONTROL_DATE = RUN_CONTROL_DATE[:-1]

# logs
API_EXTRACTION_LOG_FILE = '/tech/extraction/{RUN_CONTROL_DATE}/log/extraction-api.log'
API_EXTRACTION_LOG_FILE = API_EXTRACTION_LOG_FILE.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)

# checkpoints
API_EXTRACTION_CHECKPOINT_FILE = '/tech/extraction/{RUN_CONTROL_DATE}/checkpoint/CHECKPOINT-API-{RUN_CONTROL_DATE}.checkpoint'
API_EXTRACTION_CHECKPOINT_FILE = API_EXTRACTION_CHECKPOINT_FILE.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)