Example #1
import hashlib
import os
import subprocess

# Fragment: tail of consumer_metadata(item_), which is mapped over the
# extracted entries in the main block below. It appends each entry to the
# module-level buff_meta buffer and tracks the serialized size in size_meta.
        size_meta = 0
    if curr_file_index != file_name:
        curr_file_index = file_name
    print("%d elements in buffer" % len(buff_meta))
    size_meta += len(item_.SerializeToString())
    buff_meta.append(item_)


def finalize():
    global buff_meta, size_meta
    if len(buff_meta) > 0:
        print('writing last set of meta')

        # Name the output file after the MD5 hex digest of the first
        # buffered entry's serialized bytes.
        m = hashlib.md5()
        m.update(buff_meta[0].SerializeToString())
        curr_hash = m.hexdigest()
        with open(os.path.join(args.output, curr_hash), 'wb') as writer:
            decoder.encodeEntry(writer.write, buff_meta)
        if args.mzip:
            # Compress with 7z at maximum level, then remove the
            # uncompressed original.
            subprocess.call([
                "7z", 'a', '-mx9',
                os.path.join(args.output, curr_hash) + ".7z",
                os.path.join(args.output, curr_hash)
            ])
            os.remove(os.path.join(args.output, curr_hash))
        buff_meta = []
        size_meta = 0


if __name__ == "__main__":
    args = parser.parse_args()
    print("start")
    production = buffer_utils.extract_files(args.input, read_file, args.match,
                                            True, args.after, args.before,
                                            args.temp)
    # Drain the generator: consumer_metadata buffers each entry as a side
    # effect, and finalize() flushes whatever remains.
    for item in map(consumer_metadata, production):
        pass
    finalize()
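
To see the flush step in isolation: below is a minimal, runnable sketch of the pattern finalize() implements, with plain bytes standing in for the protobuf entries and for decoder.encodeEntry, neither of which is shown in this snippet. The helper name flush_buffer is hypothetical.

import hashlib
import os
import tempfile

def flush_buffer(buff, out_dir):
    # Mirror finalize(): name the output file after the MD5 hex digest of
    # the first buffered entry, then write every entry to it. Plain bytes
    # stand in for the serialized protobuf messages used above.
    if not buff:
        return None
    path = os.path.join(out_dir, hashlib.md5(buff[0]).hexdigest())
    with open(path, 'wb') as writer:
        for entry in buff:
            writer.write(entry)
    return path

print(flush_buffer([b'first entry', b'second entry'], tempfile.mkdtemp()))

A side effect of this content-addressed naming is that runs whose first entry serializes identically write to the same file name, so a later run overwrites the earlier output.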
Example #2
import pickle
from collections import OrderedDict

# Fragment: tail of a generator from the same script that yields one tuple
# per item whose 'count' is still positive.
    if open_[item]['count'] > 0:
      yield (item, open_[item]['date'], -1, id_, doc.title)

def get_items(doc_index, sites, link, tld):
  url = link[0]
  try:
    site = utils.get_domain(url, tld)
  except ValueError:
    # Unparseable domain: signal the caller to drop this link.
    return None
  except Exception:
    # Anything else is unexpected; re-raise unchanged.
    raise
  # Count per-site occurrences; the OrderedDict's insertion order doubles
  # as a stable integer id for each site.
  if site in sites:
    sites[site] += 1
  else:
    sites[site] = 1
  return (doc_index, list(sites.keys()).index(site),
          link[1], link[2], link[3], link[4])

if __name__ == '__main__':
  args = parser.parse_args()
  docs = buffer_utils.extract_files(args.input, read_file, args.match, False,
                                    args.after, args.before)
  docs_items = map(doc_extract_links, docs)
  tld = utils.get_tld_list(args.ltd_names)
  sites = OrderedDict()
  doc_index = 0
  final_items = []
  for items in docs_items:
    # Map every link to (doc_index, site_id, ...) and drop links whose
    # domain could not be parsed.
    resolved = (get_items(doc_index, sites, x, tld) for x in items)
    final_items.extend(t for t in resolved if t is not None)
    doc_index += 1
  with open(args.output, 'wb') as fi:
    pickle.dump({'data': final_items, 'sites': sites}, fi)
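
To make the output format concrete: the sketch below rebuilds the pickled payload shape in memory and round-trips it through pickle. The sample values are invented; in the real file, 'data' holds the 6-tuples produced by get_items and 'sites' maps each domain to its occurrence count, with a key's insertion position serving as its site id.

import pickle
from collections import OrderedDict

sites = OrderedDict()
for s in ['a.com', 'b.org', 'a.com']:
  sites[s] = sites.get(s, 0) + 1

# Insertion order doubles as the site id ('a.com' -> 0, 'b.org' -> 1);
# list(sites.keys()).index(site) is an O(n) scan per lookup.
data = [(0, list(sites.keys()).index('a.com'), 'text', 1, 2, 3)]  # invented sample

blob = pickle.dumps({'data': data, 'sites': sites})
restored = pickle.loads(blob)
print(restored['data'][0], restored['sites'])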