def run(self):
    '''
    This function is called by the launcher and completes the tasks it is
    given: each task is the name of a text file whose rows are sorted and
    written back to disk.
    '''
    while True:
        try:
            filename = self.tasks.get(block=False)
            self.tasks.task_done()
            if filename == None:
                self.result.put(None)
                break
            elif filename.startswith('comments') or \
                 filename.startswith('article'):
                continue
            fh = file_utils.create_txt_filehandle(self.rts.txt, filename,
                                                  'r', 'utf-8')
            data = file_utils.read_unicode_text(fh)
            fh.close()
            for x, d in enumerate(data):
                d = d.strip().split('\t')
                data[x] = d
            #data = [d.strip() for d in data]
            #data = [d.split('\t') for d in data]
            sorted_data = mergesort(data)
            write_sorted_file(sorted_data, filename, self.rts)
            self.result.put(True)
        except UnicodeDecodeError, error:
            print 'Error: %s, (%s)' % (error, filename)
        except MemoryError, error:
            print 'Error: %s, (%s)' % (error, filename)
def store_json_diffs(rts):
    '''
    Read the diff files, reassemble each buffered JSON object and store it
    in the diffs dataset collection.
    '''
    files = os.listdir(rts.diffs)
    #print files, rts.diffs
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    buffer = cStringIO.StringIO()
    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
        for line in fh:
            if line.startswith('\n') or line.startswith('Start'):
                obj = buffer.getvalue()
                if obj != '':
                    obj = json.loads(obj)
                    obj[0]['article_id'] = int(obj[0]['article_id'])
                    for key, value in obj[0].iteritems():
                        if key == 'timestamp':
                            value = datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
                        obj[0][key] = value
                    obj = obj[0]
                    try:
                        db.save(obj)
                    except bson.errors.InvalidDocument, error:
                        print error
                buffer = cStringIO.StringIO()
            else:
                buffer.write(line)
        fh.close()
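# Illustrative sketch (not part of the project) of the buffering convention that
# store_json_diffs relies on: each diff file holds one JSON document per block, and
# blocks are terminated by a blank line or a line starting with 'Start'. The helper
# name and sample text below are invented to show how the blocks are split.
def split_diff_blocks(text):
    blocks, buf = [], []
    for line in text.splitlines(True):
        if line.startswith('\n') or line.startswith('Start'):
            if buf:
                blocks.append(''.join(buf))
                buf = []
        else:
            buf.append(line)
    return blocks

# split_diff_blocks('Start\n[{"article_id": "7"}]\n\n') == ['[{"article_id": "7"}]\n']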
def assign_filehandle(fh, file_id, location, process_id, format):
    '''
    Return a filehandle to write to, rotating to a new file once the current
    one exceeds 64MB.
    '''
    if not fh:
        file_id = 0
        filename = '%s_%s.%s' % (file_id, process_id, format)
        fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
    else:
        size = fh.tell()
        max_size = 1024 * 1024 * 64
        if size > max_size:
            fh.close()
            file_id += 1
            filename = '%s_%s.%s' % (file_id, process_id, format)
            fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
    return fh, file_id
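# A minimal driver sketch for assign_filehandle; the demo function below is an
# assumption for illustration, not part of the project. The caller threads the current
# handle and file_id through every call and transparently receives a fresh file once
# the 64MB threshold is crossed (this still assumes file_utils is importable).
def demo_rotation(lines, location, process_id=0, format='csv'):
    fh, file_id = None, 0
    for line in lines:
        fh, file_id = assign_filehandle(fh, file_id, location, process_id, format)
        fh.write(line)
    if fh:
        fh.close()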
def run(self):
    '''
    Read sorted files with edit data and insert the observations in batches
    of 10,000 into the editors_raw collection.
    '''
    db = storage.init_database(self.rts.storage, self.rts.dbname,
                               self.rts.editors_raw)
    editor_cache = cache.EditorCache(db)
    while True:
        try:
            filename = self.tasks.get(block=False)
            self.tasks.task_done()
            if filename == None:
                self.result.put(None)
                break
            fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
                                                  'r', 'utf-8')
            data = []
            for line in file_utils.read_raw_data(fh):
                if len(line) == 1:  # or len(line) == 4:
                    continue
                obs = prepare_data(line)
                if obs != {}:
                    data.append(obs)
                if len(data) == 10000:
                    db.insert(data, safe=False)
                    data = []
            if data != []:
                db.insert(data, safe=False)
            fh.close()
            self.result.put(True)
        except Empty:
            pass
def store_articles(tasks, rts):
    '''
    Read tab-separated article metadata files and store every article as a
    document in the articles_raw collection.
    '''
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    filename = None
    while True:
        try:
            filename = tasks.get(block=False)
            tasks.task_done()
            if filename == None:
                break
            print 'Processing %s...' % filename
            fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8')
            for line in fh:
                line = line.strip()
                line = line.split('\t')
                data = {}
                x, y = 0, 1
                while y < len(line):
                    key, value = line[x], line[y]
                    if key == 'ns' or key == 'id':
                        data[key] = int(value)
                    else:
                        data[key] = value
                    x += 2
                    y += 2
                db.insert(data)
            fh.close()
        except Empty:
            pass
    print 'Finished processing %s...' % filename
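# Hypothetical, dependency-free restatement of the key/value pairing performed by
# store_articles: the tab-separated lines alternate key and value fields. The helper
# name and sample line are illustrative assumptions only.
def parse_article_line(line):
    fields = line.strip().split('\t')
    data = dict(zip(fields[0::2], fields[1::2]))
    for key in ('ns', 'id'):
        if key in data:
            data[key] = int(data[key])
    return data

# parse_article_line(u'id\t12\tns\t0\ttitle\tFoo') == {'id': 12, 'ns': 0, 'title': u'Foo'}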
def __init__(self, process_id, rts, fhd):
    super(CSVBuffer, self).__init__(rts, process_id)
    self.fhd = fhd
    self.comments = {}
    self.articles = {}
    self.filehandles = [file_utils.create_txt_filehandle(self.rts.txt,
                                                         file_id, 'a', 'utf-8')
                        for file_id in xrange(self.rts.max_filehandles)]
    self.fh_articles = file_utils.create_txt_filehandle(self.rts.txt,
                           'articles_%s' % self.process_id, 'w', 'utf-8')
    self.fh_comments = file_utils.create_txt_filehandle(self.rts.txt,
                           'comments_%s' % self.process_id, 'w', 'utf-8')
    self.fh_article_meta = file_utils.create_txt_filehandle(self.rts.txt,
                               'articles_meta_%s' % self.process_id, 'w', 'utf-8')
def write_sorted_file(sorted_data, filename, rts):
    '''
    Writes the sorted file to target
    '''
    fh = file_utils.create_txt_filehandle(rts.sorted, filename, 'w', 'utf-8')
    file_utils.write_list_to_csv(sorted_data, fh)
    fh.close()
def store_diffs_debug(rts):
    db = storage.init_database(rts)
    db.drop_collection()
    files = os.listdir(rts.diffs)
    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 'utf-8')
        diffs = json.load(fh)
        db.insert(diffs)
        fh.close()
def bot_training_dataset(bots):
    fh = file_utils.create_txt_filehandle(settings.csv_location,
                                          'training_bots.csv', 'w', 'utf-8')
    keys = bots.keys()
    for key in keys:
        bot = bots.get(key)
        bot.hours_active()
        bot.avg_lag_between_edits()
        bot.write_training_dataset(fh)
    fh.close()
def write_bot_list_to_csv(bots, keys):
    fh = file_utils.create_txt_filehandle(settings.csv_location,
                                          'bots_ids.csv', 'w', 'utf-8')
    bot_dict = convert_object_to_dict(bots, exclude=['time', 'written'])
    for bot in bot_dict:
        bot = bot_dict[bot]
        file_utils.write_dict_to_csv(bot, fh, keys, write_key=False,
                                     newline=True)
    fh.close()
def merge_sorted_files(target, files):
    '''
    Merge the smaller sorted files into one big file. Only used for creating
    the data competition file.
    '''
    fh = file_utils.create_txt_filehandle(target, 'kaggle.csv', 'w', 'utf-8')
    lines = 0
    for line in heapq.merge(*[readline(filename) for filename in files]):
        file_utils.write_list_to_csv(line, fh)
        lines += 1
    fh.close()
    print 'Total number of edits: %s ' % lines
    return fh.name
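# Toy illustration of the k-way merge that merge_sorted_files builds on: heapq.merge()
# lazily interleaves any number of already-sorted iterables, so the chunk files only
# need to be sorted individually. The generator and sample rows below stand in for the
# readline() generators over the sorted chunk files; they are assumptions.
import heapq

def toy_readline(rows):
    for row in rows:
        yield row

chunk_a = [['1', 'alice'], ['3', 'carol']]
chunk_b = [['2', 'bob'], ['4', 'dave']]
merged = list(heapq.merge(toy_readline(chunk_a), toy_readline(chunk_b)))
# merged == [['1', 'alice'], ['2', 'bob'], ['3', 'carol'], ['4', 'dave']]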
def to_csv(self, filename):
    data = data_converter.convert_dataset_to_lists(self, 'manage')
    headers = data_converter.add_headers(self)
    lock = RLock()
    fh = file_utils.create_txt_filehandle(settings.dataset_location, filename,
                                          'w', 'utf-8')
    file_utils.write_list_to_csv(headers, fh, recursive=False, newline=True)
    file_utils.write_list_to_csv(data, fh, recursive=False, newline=True,
                                 format=self.format, lock=lock)
    fh.close()
def create_edgelist(project, collection):
    '''
    Write a co-editing edgelist: every pair of editors that worked on at
    least one article in common becomes an edge, weighted by the number of
    shared articles.
    '''
    db = storage.init_database('mongo', project, collection)
    ids = db.retrieve_distinct_keys('editor')
    ids.sort()
    fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                          '%s_edgelist.csv' % project,
                                          'w', 'utf-8')
    for i in ids:
        author_i = db.find_one({'editor': i})
        if author_i != None:
            article_i = create_articles_set(author_i['edits'])
            for j in ids:
                if i > j:
                    author_j = db.find_one({'editor': j})
                    article_j = create_articles_set(author_j['edits'])
                    common = article_i.intersection(article_j)
                    if len(common) > 0:
                        file_utils.write_list_to_csv([i, j, len(common)], fh,
                                                     recursive=False,
                                                     newline=True)
    fh.close()
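# Tiny worked example of the edge weight computed by create_edgelist: two editors are
# linked when the sets of articles they edited overlap, and the weight is the size of
# that intersection. The editor ids and article ids below are made up.
articles_of_editor_1 = set([10, 11, 12])
articles_of_editor_2 = set([11, 12, 99])
common = articles_of_editor_1.intersection(articles_of_editor_2)
# common == set([11, 12]), so the row written for this pair would be [2, 1, 2] (i, j, weight)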
def download_wiki_file(task_queue, rts):
    '''
    This is a very simple replacement for wget and curl because Windows does
    not have these tools installed by default.
    '''
    success = True
    chunk = 1024 * 4
    while True:
        filename = task_queue.get(block=False)
        task_queue.task_done()
        if filename == None:
            print 'Swallowed a poison pill'
            break
        widgets = log.init_progressbar_widgets(filename)
        extension = os.path.splitext(filename)[1]
        filemode = file_utils.determine_file_mode(extension)
        filesize = http_utils.determine_remote_filesize(rts.wp_dump_location,
                                                        rts.dump_relative_path,
                                                        filename)
        mod_date = http_utils.determine_modified_date(rts.wp_dump_location,
                                                      rts.dump_relative_path,
                                                      filename)
        mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date,
                                                                  rts.timestamp_server)
        if file_utils.check_file_exists(rts.input_location, filename):
            mod_loc = file_utils.get_modified_date(rts.input_location, filename)
            if mod_loc == mod_date and (rts.force == False or rts.force == None):
                print 'You already have downloaded the most recent %s%s dumpfile.' % \
                      (rts.language.code, rts.project.name)
                continue
        if filemode == 'w':
            fh = file_utils.create_txt_filehandle(rts.input_location, filename,
                                                  filemode, rts.encoding)
        else:
            fh = file_utils.create_binary_filehandle(rts.input_location,
                                                     filename, 'wb')
        if filesize != -1:
            pbar = progressbar.ProgressBar(widgets=widgets,
                                           maxval=filesize).start()
        else:
            pbar = progressbar.ProgressBar(widgets=widgets).start()
        try:
            path = '%s%s' % (rts.dump_absolute_path, filename)
            req = urllib2.Request(path)
            response = urllib2.urlopen(req)
            while True:
                data = response.read(chunk)
                if not data:
                    print 'Finished downloading %s.' % (path)
                    break
                fh.write(data)
                filesize -= chunk
                if filesize < 0:
                    chunk = chunk + filesize
                pbar.update(pbar.currval + chunk)
        except urllib2.HTTPError, error:
            # HTTPError subclasses URLError, so it has to be caught first.
            print 'Error: %s' % error
        except urllib2.URLError, error:
            print 'Reason: %s' % error
        fh.close()