Esempio n. 1
0
 def run(self):
     """Convert every Alexa zip archive in ``self.args.src`` to a TSV file.

     Archives are the files under ``self.args.src`` whose name matches
     ``ALEXA_ZIP_FILE_FORMAT``; they are processed in sorted path order.
     For each archive a file named ``top_1m_<tstamp>`` is written to
     ``self.args.dst``, where ``<tstamp>`` is the serialized timestamp
     parsed from the archive's base name.  Each output line has the form
     ``site<TAB>tstamp<TAB>rank``.

     An archive is skipped when its destination file already exists or
     when the archive itself is zero bytes long.

     If reading the archive or writing the output fails, the partially
     written destination file is removed before the error propagates.
     Otherwise the truncated file would make every later run skip this
     archive as if it had been fully processed.
     """
     pattern = os.path.join(self.args.src, ALEXA_ZIP_FILE_FORMAT)
     for zip_f in sorted(glob.glob(pattern)):
         basename = os.path.basename(zip_f)
         date = midas.parse_tstamp(basename, ALEXA_TS_FORMAT)
         tstamp = midas.serialize_tstamp(date)
         dst_f = os.path.join(self.args.dst, 'top_1m_{0}'.format(tstamp))

         if os.path.isfile(dst_f) or os.path.getsize(zip_f) == 0:
             self.out('Skipping {0}'.format(basename))
             continue

         completed = False
         try:
             with open(dst_f, 'w') as fp:
                 for site, rank in iter_alexa_zip_file(zip_f):
                     fp.write('{0}\t{1}\t{2}\n'.format(site, tstamp, rank))
             completed = True
         finally:
             # The existence of dst_f is what marks an archive as done, so
             # a half-written file must not survive a failure.
             if not completed and os.path.isfile(dst_f):
                 os.remove(dst_f)
         self.out('Processed {0}'.format(basename))
Esempio n. 2
0
 def ids_to_samples(self):
     """Return a dict mapping site id to its ``(site, tstamp, code)`` sample.

     The samples location is ``self.args.samples`` when that is truthy,
     otherwise ``self.config['samples']``.  If the location is a regular
     file, only that file is read; otherwise it is listed as a directory
     and every regular file directly inside it is read (subdirectories
     are not descended into).

     Each file is read with ``csv_file_reader`` using a tab delimiter and
     must yield three fields per row: site, timestamp and code.  The
     timestamp is converted with ``parse_tstamp`` and the site is
     translated through ``self.sites_to_ids``.  When several rows resolve
     to the same site id, the last one read wins.
     """
     location = self.args.samples or self.config['samples']

     if os.path.isfile(location):
         sample_paths = [location]
     else:
         candidates = (os.path.join(location, entry)
                       for entry in os.listdir(location))
         sample_paths = [path for path in candidates
                         if os.path.isfile(path)]

     by_site_id = {}
     for sample_path in sample_paths:
         rows = csv_file_reader(sample_path, delimiter='\t')
         for site, raw_tstamp, code in rows:
             parsed = parse_tstamp(raw_tstamp)
             # Looked up after parsing, as before, so that a bad timestamp
             # is reported ahead of an unknown site.
             site_id = self.sites_to_ids[site]
             by_site_id[site_id] = (site, parsed, code)
     return by_site_id