def get_new_tasks_list(end_hour=24):
    time_slots = generate_daily_time_slots(end_hour)
    task_list = []
    already_processed_time_slots = get_processed_time_slots_from_checkpoint()
    for slot in time_slots:
        INTERVAL = slot[0][-6:] + "-" + slot[1][-6:]
        # images
        imagesDir = IMAGES_DIR.replace('{INTERVAL}', INTERVAL)
        if not hdfs.exists(imagesDir):
            hdfs.mkdir(imagesDir)
        # texts
        textsDir = TEXTS_DIR.replace('{INTERVAL}', INTERVAL)
        if not hdfs.exists(textsDir):
            hdfs.mkdir(textsDir)
        # only slots absent from the checkpoint become new download tasks
        if slot not in already_processed_time_slots:
            url = API_URL.replace('{START_DATETIME}', slot[0])
            url = url.replace('{END_DATETIME}', slot[1])
            task_list.append(url)
        else:
            print("Omitting previously processed slot: {0}".format(slot))
    return task_list
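# `generate_daily_time_slots` is called above but not defined in this module.
# A minimal sketch of a consistent implementation, assuming hourly
# [start, end] pairs in the 14-digit YYYYMMDDHHMMSS format the GDELT API
# expects (the `date` default is a hypothetical example value):
def generate_daily_time_slots(end_hour, date='20200101'):
    slots = []
    for hour in range(end_hour):
        start = '{0}{1:02d}0000'.format(date, hour)
        end = '{0}{1:02d}5959'.format(date, hour)
        slots.append([start, end])
    return slots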
def get_processed_time_slots_from_checkpoint():
    processed_slots = []
    checkpoint_file_content = hdfs.readFileAsString(
        API_EXTRACTION_CHECKPOINT_FILE)
    # skip the header line, then parse each pipe-delimited record
    for line in [x.strip() for x in checkpoint_file_content.split("\n")][1:]:
        if line == "":
            continue
        fields = line.split("|")
        url = fields[0]
        hdfs_file = fields[2]
        parsed_url = urlparse.urlparse(url)
        query = urlparse.parse_qs(parsed_url.query)
        start_datetime = query['STARTDATETIME'][0]
        end_datetime = query['ENDDATETIME'][0]
        if hdfs.exists(hdfs_file):  # the output file also has to exist
            processed_slots.append([start_datetime, end_datetime])
    return processed_slots
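# The checkpoint layout above is inferred from the parsing code: at least
# three pipe-delimited fields, with the request URL first and the HDFS
# output path third. A hypothetical record (all values illustrative only):
#
#   https://api.gdeltproject.org/api/v2/doc/doc?...&STARTDATETIME=20200101000000&ENDDATETIME=20200101005959|20200102|/data/gdelt/20200102/api/000000-005959.json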
def getStopWords():
    URL = 'https://raw.githubusercontent.com/aneesha/RAKE/master/SmartStoplist.txt'
    path = '/tech/STOPWORDS.txt'
    if hdfs.exists(path):
        return
    response = urllib2.urlopen(URL)
    content = response.read()
    stopWords = content.split('\n')[1:]
    hdfs.write(path, '\n'.join(stopWords))
def getCheckpointCount(path):
    if not hdfs.exists(path):
        return 0
    checkpointFileContent = hdfs.readFileAsString(path)
    counter = 0
    for line in checkpointFileContent.split('\n')[1:]:
        if '/data/gdelt' in line:
            counter += 1
    return counter
def getCountries():
    URL = 'https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv'
    path = hdfsPath + FILE_NAME.replace('{type}', 'country')
    if hdfs.exists(path):
        return
    cameoResponse = urllib2.urlopen(URL)
    cameoContent = cameoResponse.read()
    countries = []
    for line in cameoContent.split('\n')[1:]:
        splitted = line.split(',')
        if len(splitted) == 3:
            countries.append(splitted[0] + '\t' + splitted[2])
    hdfs.write(path, '\n'.join(countries))
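# The mysociety CSV is assumed to have three columns (FIPS 10-4 code,
# ISO 3166 code, country name), so the loop above keeps the first and the
# third. A hypothetical input row and its TSV output:
#
#   'UK,GB,United Kingdom'  ->  'UK\tUnited Kingdom'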
def getCheckpointsList(path, RUN_CONTROL_DATE):
    EXPORT_FILE_SUFFIX = '.export.csv'
    path = path.replace('{DATE}', str(RUN_CONTROL_DATE))
    if not hdfs.exists(path):
        return []
    checkpointFileContent = hdfs.readFileAsString(path)
    checkpointList = []
    for line in checkpointFileContent.split('\n')[1:]:
        if line == '':
            continue
        if not line.endswith(EXPORT_FILE_SUFFIX):
            errorMessage = '"{0}" does not end with the suffix ".export.csv"'.format(line)
            print(errorMessage)
            hdfs.log(LOG_PATH, errorMessage, True)
        else:
            splitted_line = line.split('/')
            pathDate = splitted_line[3]
            if pathDate == RUN_CONTROL_DATE:
                fileName = splitted_line[5].split('.')[0]
                checkpointList.append(fileName)
    return checkpointList
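# The index arithmetic above assumes checkpoint entries shaped like the
# following hypothetical path (the leading '/' yields an empty first token):
#
#   '/data/gdelt/2020-01-02/csv/20200102000000.export.csv'.split('/')
#   # -> ['', 'data', 'gdelt', '2020-01-02', 'csv', '20200102000000.export.csv']
#   #    so index [3] is the run date and [5] carries the file name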
def enqueueTasks(TASK_LIST, LIST_NAME):
    que = redis.Redis(host=REDIS_URL, port=6379)
    hdfs.log(LOG_PATH, 'Connected to Redis', False)
    que.delete(LIST_NAME)
    for task in TASK_LIST:
        que.lpush(LIST_NAME, str(task))
        hdfs.log(LOG_PATH, 'LeftPushed ' + str(task) + ' into ' + LIST_NAME + ' list', False)
    que.client_kill_filter(_id=que.client_id())
    hdfs.log(LOG_PATH, 'Disconnected from Redis', False)


if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))

DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

LOG_PATH = LOG_PATH.replace('{DATE}', DATE)
hdfs.mkdir(LOG_PATH)
LOG_PATH = LOG_PATH + '/extraction-csv.log'
if not hdfs.exists(LOG_PATH):
    hdfs.touch(LOG_PATH)

generateDirectoriesTree(DATE, DATA_DIR, DATA_SUBDIRS)
generateDirectoriesTree(DATE, DB_DIR, [])

if len(sys.argv) > 1:
    print("Max hour {}".format(sys.argv[1]))
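# `generateDirectoriesTree` is called above but not defined in this module.
# A minimal sketch of a consistent implementation, assuming the base
# directory contains a '{DATE}' placeholder like the other paths in these
# scripts and that each entry in `subdirs` becomes a child directory:
def generateDirectoriesTree(date, baseDir, subdirs):
    root = baseDir.replace('{DATE}', date)
    if not hdfs.exists(root):
        hdfs.mkdir(root)
    for subdir in subdirs:
        child = root + '/' + subdir
        if not hdfs.exists(child):
            hdfs.mkdir(child)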
import hdfs_utils as hdfs
import urlparse
import redis
import sys

RUN_CONTROL_DATE_FILE = '/tech/RUN_CONTROL_DATE.dat'

# redis
REDIS_URL = 'redis-tasks'
QUE_NAME = 'API_DOWNLOAD'

# api
API_URL = 'https://api.gdeltproject.org/api/v2/doc/doc?query=sourcelang:english&format=json&STARTDATETIME={START_DATETIME}&ENDDATETIME={END_DATETIME}'

if not hdfs.exists(RUN_CONTROL_DATE_FILE):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_DATE_FILE))

RUN_CONTROL_DATE = hdfs.readFileAsString(RUN_CONTROL_DATE_FILE)
if RUN_CONTROL_DATE.endswith('\n'):
    RUN_CONTROL_DATE = RUN_CONTROL_DATE[:-1]

# logs
API_EXTRACTION_LOG_DIR = '/tech/extraction/{RUN_CONTROL_DATE}/log/'
API_EXTRACTION_LOG_FILE = '/tech/extraction/{RUN_CONTROL_DATE}/log/extraction-api.log'
API_EXTRACTION_LOG_DIR = API_EXTRACTION_LOG_DIR.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)
API_EXTRACTION_LOG_FILE = API_EXTRACTION_LOG_FILE.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)

# checkpoints
API_EXTRACTION_CHECKPOINT_DIR = '/tech/extraction/{RUN_CONTROL_DATE}/checkpoint'
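# `hdfs_utils` is imported throughout these scripts but its source is not
# shown here. A minimal sketch of the interface the call sites rely on,
# assuming a thin wrapper around the `hdfs dfs` command-line client; this
# is an illustration of one plausible implementation, not the actual module:
#
#   import subprocess
#
#   def exists(path):
#       # `hdfs dfs -test -e` exits with 0 when the path exists
#       return subprocess.call(['hdfs', 'dfs', '-test', '-e', path]) == 0
#
#   def mkdir(path):
#       subprocess.check_call(['hdfs', 'dfs', '-mkdir', '-p', path])
#
#   def touch(path):
#       subprocess.check_call(['hdfs', 'dfs', '-touchz', path])
#
#   def readFileAsString(path):
#       return subprocess.check_output(['hdfs', 'dfs', '-cat', path])
#
#   def write(path, content):
#       proc = subprocess.Popen(['hdfs', 'dfs', '-put', '-f', '-', path],
#                               stdin=subprocess.PIPE)
#       proc.communicate(content)
#
#   def log(path, message, isError):
#       line = ('ERROR: ' if isError else 'INFO: ') + message + '\n'
#       proc = subprocess.Popen(['hdfs', 'dfs', '-appendToFile', '-', path],
#                               stdin=subprocess.PIPE)
#       proc.communicate(line)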
def assertExist(path):
    if not hdfs.exists(path):
        print("File '" + path + "' does not exist.")
    else:
        print("'{}'\t\tOK".format(path))
HDFS_PATH = ACCEPT_PATH + CSV_FILENAME[0:-4] + '.csv'
readAndPutToHdfs(CSV_FILENAME, HDFS_PATH)
checkpoint(CHECKPOINT_PATH, HDFS_PATH)
print("CHECKPOINT: " + HDFS_PATH)
remove(ZIP_FILENAME)
remove(CSV_FILENAME)


def parseTask(task):
    # task[1] is a stringified Python list, e.g. "['a', 'b']"; strip the
    # surrounding brackets, then the quotes around each element
    taskList = []
    for quotedTask in task[1][1:-1].split(', '):
        taskList.append(quotedTask[1:-1])
    return taskList


if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))

DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

REJECT_PATH = REJECT_PATH.replace('{DATE}', DATE)
ACCEPT_PATH = ACCEPT_PATH.replace('{DATE}', DATE)
LOG_PATH = LOG_PATH.replace('{DATE}', DATE)
CHECKPOINT_PATH = CHECKPOINT_PATH.replace('{DATE}', DATE)

# touch the log only after the {DATE} placeholder has been resolved;
# probing the unresolved template path would create the wrong file
if not hdfs.exists(LOG_PATH):
    hdfs.touch(LOG_PATH)

if not hdfs.exists(CHECKPOINT_PATH):
    hdfs.write(CHECKPOINT_PATH, 'FINISH_DATE|FILE_LOCATION')
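# A hypothetical round trip for `parseTask`, assuming tasks arrive as the
# (queue_name, payload) tuples returned by redis-py's brpop and that the
# payload is a str()-formatted Python list (values illustrative only):
#
#   task = ('CSV_DOWNLOAD', "['http://a.example/x.zip', 'http://a.example/y.zip']")
#   parseTask(task)
#   # -> ['http://a.example/x.zip', 'http://a.example/y.zip']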
import hdfs_utils as hdfs
import urllib2

RUN_CONTROL_PATH = '/tech/RUN_CONTROL_DATE.dat'

if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))

DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

FILE_NAME = 'CAMEO.{type}.txt'
URL = 'https://www.gdeltproject.org/data/lookups/CAMEO.{type}.txt'
TYPES = ['type', 'knowngroup', 'ethnic', 'religion', 'eventcodes']

hdfsPath = '/data/gdelt/' + str(DATE) + '/cameo/'

# `lookupType` avoids shadowing the built-in `type`
for lookupType in TYPES:
    path = hdfsPath + FILE_NAME.replace('{type}', lookupType)
    if hdfs.exists(path):
        continue
    cameoResponse = urllib2.urlopen(URL.replace('{type}', lookupType))
    # drop the header line of each CAMEO lookup file
    tmp = cameoResponse.read().split('\n')[1:]
    cameoContent = '\n'.join(tmp)
    hdfs.write(path, cameoContent)


def getCountries():
    URL = 'https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv'
    path = hdfsPath + FILE_NAME.replace('{type}', 'country')
    if hdfs.exists(path):
        return
    cameoResponse = urllib2.urlopen(URL)
    cameoContent = cameoResponse.read()
    countries = []
    for line in cameoContent.split('\n')[1:]:
        splitted = line.split(',')
        if len(splitted) == 3:
            countries.append(splitted[0] + '\t' + splitted[2])
    hdfs.write(path, '\n'.join(countries))
import os
import redis
import urlparse
import hdfs_utils as hdfs
import urllib2
import datetime

# redis
REDIS_URL = 'redis-tasks'
QUE_NAME = 'API_DOWNLOAD'

# RUN_CONTROL_DATE
RUN_CONTROL_DATE_FILE = '/tech/RUN_CONTROL_DATE.dat'
if not hdfs.exists(RUN_CONTROL_DATE_FILE):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_DATE_FILE))

RUN_CONTROL_DATE = hdfs.readFileAsString(RUN_CONTROL_DATE_FILE)
if RUN_CONTROL_DATE.endswith('\n'):
    RUN_CONTROL_DATE = RUN_CONTROL_DATE[:-1]

# logs
API_EXTRACTION_LOG_FILE = '/tech/extraction/{RUN_CONTROL_DATE}/log/extraction-api.log'
API_EXTRACTION_LOG_FILE = API_EXTRACTION_LOG_FILE.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)

# checkpoints
API_EXTRACTION_CHECKPOINT_FILE = '/tech/extraction/{RUN_CONTROL_DATE}/checkpoint/CHECKPOINT-API-{RUN_CONTROL_DATE}.checkpoint'
API_EXTRACTION_CHECKPOINT_FILE = API_EXTRACTION_CHECKPOINT_FILE.replace(
    '{RUN_CONTROL_DATE}', RUN_CONTROL_DATE)