def main():
    """CLI entry point: parse a WARC file, build an index, dump it to disk.

    Usage: <input_file> [-cf] [-sw] [-st]
      -cf  enable case folding
      -sw  enable stopword removal
      -st  enable stemming
    """
    input_file = sys.argv[1]
    if "-cf" in sys.argv:
        WarcHTMLParser.case_folding = True
    if "-sw" in sys.argv:
        WarcHTMLParser.stopword_remove = True
    if "-st" in sys.argv:
        WarcHTMLParser.stemming = True
    parser = Parser(input_file)
    # skip header record
    parser.fetch()
    starttime = time.time()
    print("Start......")
    count, index = start_parse(parser)
    print("dump index from memory to file")
    dump_start_time = time.time()
    index.dump(input_file + ".index")
    dump_end_time = time.time()
    print("----------------------------------------------------------")
    print("dump index:")
    print(dump_end_time - dump_start_time, "s")
    print("----------------------------------------------------------")
    print("finish")
    print("----------------------------------------------------------")
    print("Total time analysis:")
    # BUG FIX: snapshot the elapsed time once so the three figures below are
    # computed from the same instant (time.time() was re-read on every line,
    # making total/average/DPS mutually inconsistent).
    elapsed = time.time() - starttime
    print(elapsed, "s")
    print("Average", elapsed * 1000 / count, "ms")
    print("DPS", count / elapsed, "ps")
def multi_version(_parser: Parser):
    """Build the index using a multiprocessing worker pool.

    Fetches WARC records, hands each record's content to ``processing_async``
    workers, dumps the resulting partial indexes into a temp directory in
    batches of 1000, then merges all partials into one full ``Index``.

    Returns:
        (count, idx): number of fetch() calls performed (documents + 1)
        and the merged ``Index``.

    NOTE(review): the original annotated the return type as ``int`` but a
    2-tuple is returned; the annotation was removed to match the real value.
    """
    multiprocessing.freeze_support()
    pool = multiprocessing.Pool()
    count = 0
    result = []
    start_time = time.time()
    tmp_dir_name = "tmp/" + get_temp_dir_name()
    print(tmp_dir_name)
    os.mkdir(tmp_dir_name)

    def _drain(active_pool, pending):
        # Wait for every outstanding worker, then dump its partial index
        # under its document number. (Factored out of the two duplicated
        # close/join/dump sequences in the original.)
        active_pool.close()
        active_pool.join()
        for r in pending:
            cnt, idx = r.get()
            idx.dump(tmp_dir_name + "/" + str(cnt))

    while True:
        d = _parser.fetch()
        count += 1
        if d is None:
            _drain(pool, result)
            break
        result.append(pool.apply_async(processing_async, (count, d.content,)))
        if count % 1000 == 0:
            print("waiting...", int(count / 1000))
            _drain(pool, result)
            pool = multiprocessing.Pool()
            result = []
    print("----------------------------------------------------------")
    print("Analysis document:")
    end_time = time.time()
    print(end_time - start_time, "s")
    print("Average:", (end_time - start_time) * 1000 / count, "ms")
    print("Document process per second:", count / (end_time - start_time), "ps")
    print("----------------------------------------------------------")
    print("build full index ......")
    start_time = time.time()
    idx = Index()
    for i in range(1, count):
        # BUG FIX: was `count % 500` — a constant inside this loop, so
        # gc.collect() ran on every iteration or never. Collect once per
        # 500 merged partial indexes instead.
        if i % 500 == 0:
            gc.collect()
        idx.read_partial_index(i, PartialIndex.read(tmp_dir_name + "/" + str(i)))
        os.remove(tmp_dir_name + "/" + str(i))
    try:
        os.rmdir(tmp_dir_name)
    except Exception:
        # best-effort cleanup; directory may be non-empty or already gone
        pass
    print("----------------------------------------------------------")
    print("Build full index:")
    end_time = time.time()
    print(end_time - start_time, "s")
    print("Average:", (end_time - start_time) * 1000 / count, "ms")
    print("DPS:", count / (end_time - start_time), "ps")
    print("----------------------------------------------------------")
    return count, idx
def main():
    """CLI entry point: build and dump an index from a WARC file.

    Usage: <file> [-st] [-sw] [-cf] [-gz] [-m]
      -st  stemming          -sw  stopword removal   -cf  case folding
      -gz  gzip the dumped index                     -m   multiprocess build
    """
    f = sys.argv[1]
    multi_flag = False
    gzip_flag = False
    if "-st" in sys.argv:
        WarcHTMLParser.stemming = True
    if "-sw" in sys.argv:
        WarcHTMLParser.stopword_remove = True
    if "-cf" in sys.argv:
        WarcHTMLParser.case_folding = True
    if "-gz" in sys.argv:
        gzip_flag = True
    if "-m" in sys.argv:
        multi_flag = True
    p = Parser(f)
    # skip header record
    p.fetch()
    starttime = time.time()
    print("Start......")
    if multi_flag:
        count, index = multi_version(p)
    else:
        count, index = single_version(p)
    # BUG FIX: the message claimed the target was "<file>.index.txt" but the
    # dump below actually writes "<file>_index" — report the real path.
    print("dump index from memory to file " + f + "_index")
    dump_start_time = time.time()
    if gzip_flag:
        index.dump_gzip(f + "_index")
    else:
        index.dump(f + "_index")
    dump_end_time = time.time()
    print("----------------------------------------------------------")
    print("dump index:")
    print(dump_end_time - dump_start_time, "s")
    print("----------------------------------------------------------")
    print("finish")
    print("----------------------------------------------------------")
    print("Total time analysis:")
    # snapshot elapsed once so total/average/DPS use the same instant
    elapsed = time.time() - starttime
    print(elapsed, "s")
    print("Average", elapsed * 1000 / count, "ms")
    print("DPS", count / elapsed, "ps")
def single_version(_parser: Parser):
    """Build the index in-process, one document at a time.

    Dumps one partial index per document into a temp directory, then merges
    every partial into a full ``Index``.

    Returns:
        (count, idx): number of fetch() calls (documents + 1) and the merged
        ``Index``. On KeyboardInterrupt returns (count, None) so the caller's
        2-tuple unpack still works.
    """
    count = 0
    start_time = time.time()  # BUG FIX: was misspelled `start_tiem`
    # Hoisted out of the loop (was re-created per document) and made a raw
    # string: "\d" in a non-raw literal relies on a deprecated escape.
    content_length_re = re.compile(r"Content-Length: (\d+)\n\n")
    idx = None
    try:
        tmp_dir_name = "tmp/" + get_temp_dir_name()
        print("create tmp index directory:", tmp_dir_name)
        os.mkdir(tmp_dir_name)
        # make index
        while True:
            count += 1
            d = _parser.fetch()
            if d is None:
                break
            # locate the end of the record headers; the HTML body follows
            html_start = content_length_re.search(d.content)
            html = d.content[html_start.span()[1]:]
            result = processing(html, html_start.span()[1])
            result.dump(tmp_dir_name + "/" + str(count))
            if count % 1000 == 0:
                print("waiting...", int(count / 1000))
        print("----------------------------------------------------------")
        print("Analysis document:")
        end_time = time.time()
        print(end_time - start_time, "s")
        print("Average:", (end_time - start_time) * 1000 / count, "ms")
        print("DPS:", count / (end_time - start_time), "ps")
        print("----------------------------------------------------------")
        print("build full index ......")
        start_time = time.time()
        idx = Index()
        for i in range(1, count):
            # BUG FIX: was `count % 500` — constant inside this loop, so
            # gc.collect() ran every iteration or never. Collect once per
            # 500 merged partial indexes instead.
            if i % 500 == 0:
                gc.collect()
            idx.read_partial_index(i, PartialIndex.read(tmp_dir_name + "/" + str(i)))
            os.remove(tmp_dir_name + "/" + str(i))
        try:
            os.rmdir(tmp_dir_name)
        except Exception:
            # best-effort cleanup; directory may be non-empty or already gone
            pass
        print("----------------------------------------------------------")
        print("Build full index:")
        end_time = time.time()
        print(end_time - start_time, "s")
        print("Average:", (end_time - start_time) * 1000 / count, "ms")
        print("DPS:", count / (end_time - start_time), "ps")
        print("----------------------------------------------------------")
    except KeyboardInterrupt:
        print("Closing worker...")
        # BUG FIX: the original returned a bare int here while the normal
        # path returns a 2-tuple, so the caller's `count, index = ...`
        # unpack crashed on interrupt.
        return count, idx
    return count, idx
__author__ = 'raccoon'

import sys
import os
import gzip

from warc.parser import Parser

if __name__ == "__main__":
    # Build a gzipped "<file>.idx" side file mapping each WARC record's ID
    # to its byte offset, so records can later be located by seek.
    file = sys.argv[1]
    idx_file = file + ".idx"
    if os.path.isfile(idx_file):
        print("idx file for", file, " is exist.")
        # BUG FIX: quit() is an interactive helper injected by the `site`
        # module and is not guaranteed to exist; sys.exit() is the
        # supported way to terminate a script.
        sys.exit()
    with gzip.open(idx_file, "wb") as f:
        p = Parser(file)
        while True:
            r = p.fetch()
            if not r:
                break
            c = r.warc_header["WARC-Record-ID"] + " " + str(r.offset_seek) + "\n"
            # idiomatic str -> bytes (was bytes(c, p.encoding))
            f.write(c.encode(p.encoding))
__author__ = 'raccoon'

import sys
import os
import gzip

from warc.parser import Parser

if __name__ == "__main__":
    # Write a gzipped "<file>.idx" side file with one line per WARC record:
    # "<WARC-Record-ID> <byte offset>". Skip the work if it already exists.
    source = sys.argv[1]
    idx_path = source + ".idx"
    if os.path.isfile(idx_path):
        print("idx file for", source, " is exist.")
        quit()
    with gzip.open(idx_path, "wb") as out:
        reader = Parser(source)
        record = reader.fetch()
        while record:
            entry = record.warc_header["WARC-Record-ID"] + " " + str(
                record.offset_seek) + "\n"
            out.write(bytes(entry, reader.encoding))
            record = reader.fetch()
def query():
    """Run a tf-idf cosine-similarity query against a pre-built index.

    Command line: -w <warc_file> [-r <top_k>] -q <term> [<term> ...]

    Reads "<warc_file>_index.dict" (term -> byte offset) and
    "<warc_file>_index.idx" (inverted index), scores every candidate
    document against the query vector, and prints the top results.
    """
    file_name = ""
    return_count = 10
    N = 0  # total number of documents in the collection
    # parse parameters
    if len(sys.argv) >= 3:
        if "-w" in sys.argv:
            file_name = sys.argv[sys.argv.index("-w") + 1]
        else:
            usage()
        if "-r" in sys.argv:
            return_count = int(sys.argv[sys.argv.index("-r") + 1])
        if "-q" in sys.argv:
            query_string = sys.argv[sys.argv.index("-q") + 1:]
        else:
            usage()
    else:
        usage()
    # set idx file and dict file path
    idx_file = file_name + "_index.idx"
    dict_file = file_name + "_index.dict"
    # error detection
    # BUG FIX: the second condition tested idx_file twice, so a missing
    # dictionary file was never detected and crashed later at open().
    if not os.path.isfile(idx_file) or not os.path.isfile(dict_file):
        print("Error: index dictionary file(_index.dict or inverted index file (_index.idx) not found.",
              file=sys.stderr)
        exit(1)
    # count total document N
    parser = Parser(file_name)
    while parser.fetch() is not None:
        N += 1
    # read dict file into a term -> offset map
    # (context manager closes the handle; the original leaked it)
    dicts = {}
    with open(dict_file) as df:
        for d in df:
            (key, offset) = d.split(', ')
            dicts[key] = int(offset)
    term_index = {}   # term -> postings (doc# -> positions)
    query_table = {}  # per-term weights of the query vector
    docs_table = {}   # per-document, per-term tf/weight
    docs_set = set()  # candidate documents (union of all postings)
    docs_score = {}   # doc# -> cosine similarity score
    # Calculate query's weight (tf is always 1, so w == idf)
    for term in query_string:
        if term in dicts:
            term_index[term] = Index.read_index_by_offset(idx_file, dicts[term]).index[term]
            # add doc# to set
            for doc in term_index[term]:
                docs_set.add(int(doc))
            df_t = len(term_index[term])
            query_table[term] = {
                "tf": 1,
                "df": df_t,
                "idf": math.log(N / df_t, 10),
                "w": (1 + math.log(1)) * math.log(N / df_t, 10),
            }
        else:
            term_index[term] = {}
            query_table[term] = {"tf": 1, "df": 0, "idf": 0, "w": 0}
    # Gather each candidate document's per-term term frequency.
    # (The original also accumulated an unused `euclidean_length`; that dead
    # computation was dropped — the document norm is computed below.)
    for doc_id in docs_set:
        element = str(doc_id)
        docs_table[element] = {}
        for term in query_string:
            tf = len(term_index[term][element]) if element in term_index[term] else 0
            docs_table[element][term] = {"tf": tf}
    # Document-side weights
    for doc in docs_table:
        for term in query_string:
            tf = docs_table[doc][term]["tf"]
            if tf > 0:
                # NOTE(review): this multiplies by log10(df), not idf
                # (log10(N/df)) — kept as-is to preserve the original
                # ranking; confirm the intended weighting scheme.
                docs_table[doc][term]["w"] = (1 + math.log(tf, 10)) * math.log(query_table[term]["df"], 10)
            else:
                docs_table[doc][term]["w"] = 0
    # Cosine similarity: dot(query, doc) / (|doc| * |query|)
    query_len = math.sqrt(sum(query_table[t]["w"] ** 2 for t in query_string))
    for doc in docs_table:
        up_part = 0
        doc_len = 0
        for term in query_string:
            up_part += docs_table[doc][term]["w"] * query_table[term]["w"]
            doc_len += docs_table[doc][term]["w"] * docs_table[doc][term]["w"]
        denom = math.sqrt(doc_len) * query_len
        # ROBUSTNESS: a zero-length vector (e.g. every matched term occurs
        # in all N documents, making idf 0) previously raised
        # ZeroDivisionError; score such documents 0 instead.
        docs_score[doc] = up_part / denom if denom else 0
    print("Query terms:", query_string)
    print("Top", return_count, "results:")
    print("doc#\tscore")
    for i in sorted(docs_score, key=docs_score.get, reverse=True):
        return_count -= 1
        if return_count < 0:
            break
        print("%d\t%.3f" % (int(i), docs_score[i]))