def handle_task(task, run_control_date):
    # Download one article-info JSON from the API URL in task[0], store it in
    # HDFS, and record the finished task in the checkpoint file.
    print("HANDLING: " + str(task))

    try:
        url = task[0]
        parsed_url = urlparse.urlparse(url)

        # The interval label (HHMMSS-HHMMSS) is built from the STARTDATETIME
        # and ENDDATETIME query parameters of the API URL.
        start_datetime = urlparse.parse_qs(
            parsed_url.query)['STARTDATETIME'][0]
        start_time = start_datetime[-6:]
        end_datetime = urlparse.parse_qs(parsed_url.query)['ENDDATETIME'][0]
        end_time = end_datetime[-6:]

        response = urllib2.urlopen(url)
        json_content = response.read()

        article_info_json = ARTICLE_INFO_JSON.replace(
            '{INTERVAL}', start_time + "-" + end_time)

        hdfs.write(article_info_json, json_content)
    except Exception as e:
        print(e)
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'Error {0} while working on task {1}'.format(e, task), False)
        return

    hdfs.append(
        API_EXTRACTION_CHECKPOINT_FILE,
        '{}|{}|{}'.format(url,
                          datetime.datetime.now().strftime('%Y%m%d%H%M%S'),
                          article_info_json))
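
# A minimal usage sketch with hypothetical values: the task is assumed to be
# a tuple whose first element is an API URL carrying STARTDATETIME and
# ENDDATETIME query parameters (as in the GDELT DOC API); handle_task slices
# out their HHMMSS parts to fill the '{INTERVAL}' placeholder of the output
# path.
example_task = (
    'https://api.gdeltproject.org/api/v2/doc/doc?query=example'
    '&STARTDATETIME=20190101000000&ENDDATETIME=20190101010000&format=json',
)
handle_task(example_task, '20190101')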


def handleTask(TASK, DATE):
    # Download one GDELT zip archive (TASK[2] is its URL, TASK[1] its md5
    # checksum), verify the checksum, and move the extracted CSV into HDFS.
    print("HANDLING: " + str(TASK))
    ZIP_FILENAME = TASK[2].split('/')[-1]

    zipResponse = urllib2.urlopen(TASK[2])
    zipContent = zipResponse.read()
    if not checkMd5Sum(zipContent, TASK[1]):
        # reject() already stores the bad archive under REJECT_PATH and logs it.
        reject(zipContent, ZIP_FILENAME, DATE)
        return
    else:
        hdfs.log(LOG_PATH, ZIP_FILENAME + ' has correct md5Sum value', False)

    saveFileAs(zipContent, ZIP_FILENAME)
    print("SAVED: " + ZIP_FILENAME)
    with zipfile.ZipFile(ZIP_FILENAME, 'r') as zip_ref:
        zip_ref.extractall('.')
    # Strip the '.zip' suffix to get the extracted file name, then strip that
    # file's own extension when building the HDFS path ending in '.csv'.
    CSV_FILENAME = ZIP_FILENAME[0:-4]
    print("UNZIPPED: " + CSV_FILENAME)

    HDFS_PATH = ACCEPT_PATH + CSV_FILENAME[0:-4] + '.csv'
    readAndPutToHdfs(CSV_FILENAME, HDFS_PATH)
    checkpoint(CHECKPOINT_PATH, HDFS_PATH)
    print("CHECKPOINT: " + HDFS_PATH)
    remove(ZIP_FILENAME)
    remove(CSV_FILENAME)
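
# Hedged sketches of two helpers that handleTask relies on but that are not
# shown above; the names and call signatures follow the calls in handleTask,
# while the bodies are assumptions.
import hashlib


def checkMd5Sum(content, expectedMd5):
    # Compare the md5 hex digest of the downloaded bytes against the checksum
    # supplied with the task (TASK[1]).
    return hashlib.md5(content).hexdigest() == expectedMd5


def saveFileAs(content, fileName):
    # Write the downloaded archive to the local working directory so that
    # zipfile.ZipFile can open it afterwards.
    with open(fileName, 'wb') as local_file:
        local_file.write(content)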


def set_up(max_hour):
    # Prepare the pipeline inputs for hour slots [0, max_hour], then run the
    # csv-distinct and country-mapping processing jobs.
    print("Setting up {}".format(RUN_CONTROL_DATE_PATH))
    delete_recursively(RUN_CONTROL_DATE_PATH)
    write(RUN_CONTROL_DATE_PATH, Config.RUN_CONTROL_DATE)

    print("Setting up dictionaries {}".format(DICTIONARIES_PATH))
    subprocess.check_output('python2.7 ../../acquisition/get_dictionaries.py', shell=True)

    # download api
    print("Setting up api {} for hour time slot [0, {}]".format(ARTICLE_INFO_JSON, max_hour))
    subprocess.check_output('python2.7 ../../acquisition/generate_tasks_api.py {}'.format(max_hour),
                            shell=True)
    subprocess.check_output('python2.7 ../../acquisition/download_api.py', shell=True)
    for path in listPath(ARTICLE_INFO_JSON):
        print("\t {}".format(path))

    # download csv
    print("Setting up csv {} for hour time slot [0, {}]".format(ARTICLE_CSV, max_hour))
    subprocess.check_output('python2.7 ../../acquisition/generate_tasks.py {}'.format(max_hour),
                            shell=True)
    subprocess.check_output('python2.7 ../../acquisition/download_csv.py', shell=True)
    for path in listPath(ARTICLE_CSV):
        print("\t {}".format(path))

    # csv distinct
    print("Running csv-distinct")
    subprocess.check_output('../run_processing.sh distinct ../csvdistinct/target/csv-distinct-1.0-SNAPSHOT.jar',
                            shell=True)

    # map country
    print("Running country-mapping")
    subprocess.check_output('../run_processing.sh country ../country-mapping/target/country-mapping-1.0-SNAPSHOT.jar',
                            shell=True)


def getStopWords():
    # Fetch the SMART stop-word list and store it in HDFS, skipping the
    # header line of the source file.
    URL = 'https://raw.githubusercontent.com/aneesha/RAKE/master/SmartStoplist.txt'
    path = '/tech/STOPWORDS.txt'
    if hdfs.exists(path):
        return
    response = urllib2.urlopen(URL)
    content = response.read()
    stopWords = content.split('\n')[1:]
    hdfs.write(path, '\n'.join(stopWords))

def enqueue_tasks(tasks):
    # Left-push every task onto the Redis work list and log it; the snippet
    # is truncated at this point, so the signature and the lpush call are
    # assumptions inferred from the log message and from QUE_NAME.
    for task in tasks:
        que.lpush(QUE_NAME, str(task))
        hdfs.log(API_EXTRACTION_LOG_FILE,
                 'LeftPushed ' + str(task) + ' into ' + QUE_NAME + ' list',
                 False)

    que.client_kill_filter(_id=que.client_id())

    hdfs.log(API_EXTRACTION_LOG_FILE, 'Disconnected from Redis', False)


# log
if not hdfs.exists(API_EXTRACTION_LOG_DIR):
    hdfs.mkdir(API_EXTRACTION_LOG_DIR)
if not hdfs.exists(API_EXTRACTION_LOG_FILE):
    hdfs.touch(API_EXTRACTION_LOG_FILE)

# checkpoint
if not hdfs.exists(API_EXTRACTION_CHECKPOINT_DIR):
    hdfs.mkdir(API_EXTRACTION_CHECKPOINT_DIR)
if not hdfs.exists(API_EXTRACTION_CHECKPOINT_FILE):
    hdfs.write(API_EXTRACTION_CHECKPOINT_FILE,
               'API_URL|FINISH_DATETIME|FILE_LOCATION')

if len(sys.argv) > 1:
    print("Max hour {}".format(sys.argv[1]))
    new_tasks = get_new_tasks_list(sys.argv[1])
else:
    print("Max hour {}".format(24))
    new_tasks = get_new_tasks_list()

enqueue_tasks(new_tasks)


def readAndPutToHdfs(path, hdfs_path):
    # Copy a local file into HDFS at hdfs_path.
    with open(path, 'r') as local_file:
        hdfs.write(hdfs_path, local_file.read())


def reject(zipContent, fileName, DATE):
    # Log the rejection and store the bad archive under REJECT_PATH.
    print("REJECT: " + fileName)
    hdfs.log(LOG_PATH, 'Reject file "' + fileName + '"', True)
    hdfs.write(REJECT_PATH + fileName, zipContent)


if not hdfs.exists(RUN_CONTROL_PATH):
    raise Exception('There is no tech file in ' + str(RUN_CONTROL_PATH))
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

REJECT_PATH = REJECT_PATH.replace('{DATE}', DATE)
ACCEPT_PATH = ACCEPT_PATH.replace('{DATE}', DATE)
LOG_PATH = LOG_PATH.replace('{DATE}', DATE)
CHECKPOINT_PATH = CHECKPOINT_PATH.replace('{DATE}', DATE)

# Create the dated log file before it is first appended to.
if not hdfs.exists(LOG_PATH):
    hdfs.touch(LOG_PATH)

if not hdfs.exists(CHECKPOINT_PATH):
    hdfs.write(CHECKPOINT_PATH, 'FINISH_DATE|FILE_LOCATION')

que = redis.Redis(host=REDIS_URL, port=6379)

# Consume tasks from the Redis list until blpop times out, i.e. the queue is
# drained.
isEmpty = False
while not isEmpty:
    task = que.blpop(QUE_NAME, timeout=1)
    if task is None:
        isEmpty = True
        print("EMPTY QUEUE")
    else:
        handleTask(parseTask(task), DATE)

que.client_kill_filter(_id=que.client_id())
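
# Hedged sketch of parseTask, which is not shown in this snippet: blpop
# returns a (queue_name, value) pair, and handleTask expects the parsed task
# to expose the md5 checksum at index 1 and the zip URL at index 2. The
# whitespace-separated "size md5 url" layout (as in the GDELT master file
# list) is an assumption.
def parseTask(task):
    return task[1].split()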
DATE = hdfs.readFileAsString(RUN_CONTROL_PATH)
if DATE.endswith('\n'):
    DATE = DATE[:-1]

# Download the GDELT CAMEO lookup tables into HDFS under the run control
# date, dropping the header line of each file.
FILE_NAME = 'CAMEO.{type}.txt'
URL = 'https://www.gdeltproject.org/data/lookups/CAMEO.{type}.txt'
TYPES = ['type', 'knowngroup', 'ethnic', 'religion', 'eventcodes']
hdfsPath = '/data/gdelt/' + str(DATE) + '/cameo/'
for type in TYPES:
    path = hdfsPath + FILE_NAME.replace('{type}', type)
    if hdfs.exists(path):
        continue
    cameoResponse = urllib2.urlopen(URL.replace('{type}', type))
    tmp = cameoResponse.read().split('\n')[1:]
    cameoContent = '\n'.join(tmp)
    hdfs.write(path, cameoContent)


def getCountries():
    # Build a tab-separated country lookup from the FIPS 10-4 / ISO country
    # code CSV and store it next to the CAMEO dictionaries in HDFS.
    URL = 'https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv'
    path = hdfsPath + FILE_NAME.replace('{type}', 'country')
    if hdfs.exists(path):
        return
    cameoResponse = urllib2.urlopen(URL)
    cameoContent = cameoResponse.read()
    countries = []
    for line in cameoContent.split('\n')[1:]:
        splitted = line.split(',')
        if len(splitted) == 3:
            countries.append(splitted[0] + '\t' + splitted[2])
    hdfs.write(path, '\n'.join(countries))