-
Notifications
You must be signed in to change notification settings - Fork 0
/
Downloader.py
114 lines (85 loc) · 3.96 KB
/
Downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# encoding: utf-8
""" Downloader.py """
from urllib import urlretrieve
from gzip import open as gz_open
from os.path import join
from os import remove
from time import time, gmtime, struct_time
from calendar import timegm
from logging import getLogger
from FileSystemWorker import create_or_check_path, json_file_load
class Downloader(object):
    """Methods of downloading and saving GitHub archives.

    Archives are fetched one hour at a time. The hour that was requested last
    is kept in ``self.config["last_connection_time"]`` and every call of
    :meth:`download_archive` moves on to the following hour.
    """

    # Timezone difference in seconds between GMT and the west coast of the
    # USA; added to a GMT timestamp before the archive name is built.
    _ARCHIVE_TZ_OFFSET = -25200
    # Distance in seconds between two consecutive archives (one hour).
    _ARCHIVE_STEP = 3600
    # An hour is requested only once it lies at least this many seconds
    # in the past.
    _MIN_ARCHIVE_AGE = 7200

    def __init__(self):
        """Set up working directories, the logger and the configuration."""
        self.downloaded_data_dir = "downloaded_data/"
        self.new_data_dir = "new_data/"
        self.config_file = join("config/", "downloader.conf")

        create_or_check_path(self.downloaded_data_dir)
        create_or_check_path(self.new_data_dir)
        create_or_check_path(self.config_file)

        self.logger = getLogger('LOGGER')
        self.config = self.configuration()

    def configuration(self):
        """Invoke downloader configuration.

        :return: configuration from config file if invoking was successful
            (i.e. the loaded value is truthy) else default configuration
        """
        default = {"last_connection_time": struct_time((2014, 5, 9, 12, 0, 0, 3, 134, 0))}
        loaded = json_file_load(self.config_file)
        return loaded or default

    def download_file(self, name):
        """Download file from GitHub archive and unpack it.

        The gzipped archive is stored in ``self.downloaded_data_dir``,
        decompressed into ``self.new_data_dir`` and then deleted.

        :param name: name of GitHub archive in format YYYY-MM-DD-h
        :return: name of JSON file with data if downloading was successful else None
        """
        archive_name = name + ".json.gz"
        archive_path = join(self.downloaded_data_dir, archive_name)
        file_name = join(self.new_data_dir, name + ".json")

        try:
            urlretrieve("http://data.githubarchive.org/" + archive_name,
                        filename=archive_path)
        except IOError:
            self.logger.error(__name__ + ": " + "unable to download file (error creating connection).")
            # Nothing usable was fetched, so there is no point in trying to
            # unpack whatever may be lying at archive_path.
            return None

        if not self._extract_archive(archive_path, file_name):
            return None

        remove(archive_path)
        return file_name

    def _extract_archive(self, archive_path, file_name):
        """Decompress a gzipped archive into a plain file.

        :param archive_path: path of the gzipped archive to read
        :param file_name: path of the file that receives the decompressed data
        :return: True if the data was written completely else False
        """
        # Local import: shutil is not among the module-level imports.
        from shutil import copyfileobj

        try:
            archive = gz_open(archive_path)
        except IOError:
            self.logger.error(__name__ + ": " + "unable to open gzipped file (file not created).")
            return False

        try:
            # Both handles are closed even when decompression fails half-way.
            # The payload is bytes, hence binary mode; copying in chunks
            # avoids holding the whole decompressed archive in memory.
            with archive:
                with open(file_name, "wb") as json_file:
                    copyfileobj(archive, json_file)
        except (IOError, EOFError):
            # Raised while reading when the downloaded file is not a valid
            # or complete gzip stream, or when the output cannot be written.
            self.logger.error(__name__ + ": " + "unable to unpack gzipped file (corrupted archive or write error).")
            try:
                # Best effort: do not leave a truncated JSON file behind.
                remove(file_name)
            except OSError:
                pass
            return False

        return True

    @staticmethod
    def _archive_name(structure):
        """Build a GitHub archive name from a time structure.

        :param structure: tuple representation of time (``struct_time``)
        :return: GitHub archive time; month and day are zero-padded to two
            digits, the hour is not padded
        """
        return "%d-%02d-%02d-%d" % (
            structure.tm_year, structure.tm_mon, structure.tm_mday, structure.tm_hour)

    def download_archive(self):
        """Get data from GitHub server.

        Requests the hour following ``self.config["last_connection_time"]``
        and stores that hour as the new ``last_connection_time``.

        :return: name of JSON file with data if downloading was successful else None
        """
        current_time = self.get_time()
        self.logger.debug(__name__ + ": " + "current time: " + str(gmtime(current_time)))

        downloading_time = int(timegm(self.config["last_connection_time"])) + self._ARCHIVE_STEP
        self.logger.debug(__name__ + ": " + "downloading time: " + str(gmtime(downloading_time)))

        if downloading_time > current_time - self._MIN_ARCHIVE_AGE:
            self.logger.info(__name__ + ": " + "unable to download file (time limiting).")
            return None

        # Archive names use the shifted clock, the stored marker stays in GMT.
        archive_clock = gmtime(downloading_time + self._ARCHIVE_TZ_OFFSET)
        json_file_name = self.download_file(self._archive_name(archive_clock))

        # NOTE: the marker advances whether or not the download succeeded,
        # so a failed hour is not requested again by the next call.
        self.config["last_connection_time"] = gmtime(downloading_time)
        self.logger.debug(__name__ + ": " + "last_connection_time: " + str(self.config["last_connection_time"]))

        return json_file_name

    @staticmethod
    def get_time():
        """
        :return: current time in seconds since the Epoch
        """
        return int(time())