Example 1
import sys
import time

from warc.parser import Parser
# WarcHTMLParser and start_parse are project-local; their modules are not
# shown in this example.


def main():
    input_file = sys.argv[1]

    if "-cf" in sys.argv:
        WarcHTMLParser.case_folding = True
    if "-sw" in sys.argv:
        WarcHTMLParser.stopword_remove = True
    if "-st" in sys.argv:
        WarcHTMLParser.stemming = True

    parser = Parser(input_file)
    # skip header record
    parser.fetch()

    starttime = time.time()

    print("Start......")
    count, index = start_parse(parser)

    print("dump index from memory to file")
    dump_start_time = time.time()
    index.dump(input_file + ".index")
    dump_end_time = time.time()
    print("----------------------------------------------------------")
    print("dump index:")
    print(dump_end_time - dump_start_time, "s")
    print("----------------------------------------------------------")
    print("finish")
    print("----------------------------------------------------------")
    print("Total time analysis:")
    print(time.time() - starttime, "s")
    print("Average", (time.time() - starttime) * 1000 / count, "ms")
    print("DPS", count / (time.time() - starttime), "ps")
Example 2
import gc
import multiprocessing
import os
import time

from warc.parser import Parser
# Index, PartialIndex, processing_async and get_temp_dir_name are
# project-local and not shown in this example.


def multi_version(_parser: Parser):  # returns (count, Index)
    multiprocessing.freeze_support()
    pool = multiprocessing.Pool()
    count = 0
    result = []

    start_time = time.time()
    tmp_dir_name = "tmp/" + get_temp_dir_name()
    print(tmp_dir_name)
    os.makedirs(tmp_dir_name)  # also creates "tmp/" if it does not exist yet

    while True:
        d = _parser.fetch()
        if d is not None:
            count += 1  # count only successfully fetched records
            result.append(pool.apply_async(processing_async, (count, d.content,)))

            if count % 1000 == 0:
                print("waiting...", int(count / 1000))
                pool.close()
                pool.join()
                for r in result:
                    cnt, idx = r.get()
                    idx.dump(tmp_dir_name + "/" + str(cnt))
                pool = multiprocessing.Pool()
                result = []
        else:
            pool.close()
            pool.join()
            for r in result:
                cnt, idx = r.get()
                idx.dump(tmp_dir_name + "/" + str(cnt))
            break
    print("----------------------------------------------------------")
    print("Analysis document:")
    end_time = time.time()
    print(end_time - start_time, "s")
    print("Average:", (end_time - start_time) * 1000 / count, "ms")
    print("Document process per second:", count / (end_time - start_time), "ps")
    print("----------------------------------------------------------")
    print("build full index ......")
    start_time = time.time()
    idx = Index()
    for i in range(1, count + 1):
        if i % 500 == 0:
            gc.collect()
        idx.read_partial_index(i, PartialIndex.read(tmp_dir_name + "/" + str(i)))
        os.remove(tmp_dir_name + "/" + str(i))
    try:
        os.rmdir(tmp_dir_name)
    except Exception:
        pass
    print("----------------------------------------------------------")
    print("Build full index:")
    end_time = time.time()
    print(end_time - start_time, "s")
    print("Average:", (end_time - start_time) * 1000 / count, "ms")
    print("DPS:", count / (end_time - start_time), "ps")
    print("----------------------------------------------------------")
    return count, idx
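get_temp_dir_name is project-local and not shown. A minimal stand-in, assuming any collision-free string is acceptable as a scratch-directory name:

import uuid

# Hypothetical stand-in for the project's get_temp_dir_name():
# a random hex string is unique enough for a scratch directory.
def get_temp_dir_name():
    return uuid.uuid4().hex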
Example 3
import sys
import time

from warc.parser import Parser
# WarcHTMLParser, multi_version and single_version are project-local and
# not shown in this example.


def main():
    f = sys.argv[1]

    multi_flag = False
    gzip_flag = False

    if "-st" in sys.argv:
        WarcHTMLParser.stemming = True
    if "-sw" in sys.argv:
        WarcHTMLParser.stopword_remove = True
    if "-cf" in sys.argv:
        WarcHTMLParser.case_folding = True
    if "-gz" in sys.argv:
        gzip_flag = True
    if "-m" in sys.argv:
        multi_flag = True

    p = Parser(f)
    # skip header record
    p.fetch()

    starttime = time.time()

    print("Start......")
    if multi_flag:
        count, index = multi_version(p)
    else:
        count, index = single_version(p)

    print("dump index from memory to file " + f + ".index.txt")
    dump_start_time = time.time()
    if gzip_flag:
        index.dump_gzip(f + "_index")
    else:
        index.dump(f + "_index")
    dump_end_time = time.time()
    print("----------------------------------------------------------")
    print("dump index:")
    print(dump_end_time - dump_start_time, "s")
    print("----------------------------------------------------------")
    print("finish")
    print("----------------------------------------------------------")
    print("Total time analysis:")
    print(time.time() - starttime, "s")
    print("Average", (time.time() - starttime) * 1000 / count, "ms")
    print("DPS", count / (time.time() - starttime), "ps")
Example 4
import gc
import os
import re
import time

from warc.parser import Parser
# Index, PartialIndex, processing and get_temp_dir_name are project-local
# and not shown in this example.


def single_version(_parser: Parser):
    count = 0
    start_time = time.time()
    try:
        # _parser.goto(1)

        tmp_dir_name = "tmp/" + get_temp_dir_name()
        print("create tmp index directory:", tmp_dir_name)
        os.makedirs(tmp_dir_name)  # also creates "tmp/" if it does not exist yet
        # make index
        while True:
            d = _parser.fetch()
            if d is not None:
                count += 1  # count only successfully fetched records
                # the HTML body starts right after the HTTP header block
                c = re.compile(r"Content-Length: (\d+)\n\n")
                html_start = c.search(d.content)
                html = d.content[html_start.end():]
                result = processing(html, html_start.end())
                result.dump(tmp_dir_name + "/" + str(count))

                if count % 1000 == 0:
                    print("waiting...", count // 1000)
            else:
                break
        print("----------------------------------------------------------")
        print("Analysis document:")
        end_time = time.time()
        print(end_time - start_tiem, "s")
        print("Average:", (end_time - start_tiem) * 1000 / count, "ms")
        print("DPS:", count / (end_time - start_tiem), "ps")
        print("----------------------------------------------------------")
        print("build full index ......")
        start_time = time.time()
        idx = Index()
        for i in range(1, count + 1):
            if i % 500 == 0:
                gc.collect()
            idx.read_partial_index(i, PartialIndex.read(tmp_dir_name + "/" + str(i)))
            os.remove(tmp_dir_name + "/" + str(i))
        try:
            os.rmdir(tmp_dir_name)
        except Exception:
            pass
        print("----------------------------------------------------------")
        print("Build full index:")
        end_time = time.time()
        print(end_time - start_tiem, "s")
        print("Average:", (end_time - start_tiem) * 1000 / count, "ms")
        print("DPS:", count / (end_time - start_tiem), "ps")
        print("----------------------------------------------------------")
    except KeyboardInterrupt:
        print("Closing worker...")
        return count, None  # keep the (count, index) shape for callers
    return count, idx
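The Content-Length regex above splits the HTTP header block from the HTML body. A self-contained illustration on a made-up record string:

import re

# Toy illustration of the header/body split (sample data is made up).
record = "HTTP/1.1 200 OK\nContent-Length: 15\n\n<html>hi</html>"
m = re.search(r"Content-Length: (\d+)\n\n", record)
html = record[m.end():]
print(m.group(1), repr(html))  # -> 15 '<html>hi</html>'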
Example 5
__author__ = 'raccoon'

import sys
import os
import gzip
from warc.parser import Parser

if __name__ == "__main__":
    file = sys.argv[1]
    idx_file = file + ".idx"
    if os.path.isfile(idx_file):
        print("idx file for", file, " is exist.")
        quit()

    with gzip.open(idx_file, "wb") as f:
        p = Parser(file)

        while True:
            r = p.fetch()
            if not r:
                break
            c = r.warc_header["WARC-Record-ID"] + " " + str(r.offset_seek) + "\n"
            f.write(bytes(c, p.encoding))
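Reading the offsets back is the mirror image of this script. A sketch assuming the same "<WARC-Record-ID> <offset>" line format and UTF-8 text:

import gzip

# Sketch: load the gzipped "<WARC-Record-ID> <offset>" lines into a dict.
# Assumes UTF-8; the writer above uses p.encoding.
def load_offsets(idx_file):
    offsets = {}
    with gzip.open(idx_file, "rt", encoding="utf-8") as f:
        for line in f:
            record_id, offset = line.rsplit(" ", 1)
            offsets[record_id] = int(offset)
    return offsets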
Example 7
import math
import os
import sys

from warc.parser import Parser
# Index and usage are project-local and not shown in this example.


def query():
    file_name = ""
    return_count = 10
    N = 0

    # parse parameters
    if len(sys.argv) >= 3:
        if "-w" in sys.argv:
            file_name = sys.argv[sys.argv.index("-w") + 1]
        else:
            usage()
        if "-r" in sys.argv:
            return_count = int(sys.argv[sys.argv.index("-r") + 1])

        if "-q" in sys.argv:
            query_string = sys.argv[sys.argv.index("-q") + 1:]
        else:
            usage()
    else:
        usage()

    # set idx file and dict file path
    idx_file = file_name + "_index.idx"
    dict_file = file_name + "_index.dict"

    # error detection
    if not os.path.isfile(idx_file) or not os.path.isfile(dict_file):
        print("Error: index dictionary file (_index.dict) or inverted index file (_index.idx) not found.",
              file=sys.stderr)
        exit(1)


    # count the total number of documents N
    parser = Parser(file_name)
    while True:
        if parser.fetch() is not None:
            N += 1
        else:
            break

    # read the dict file into a dict: term -> byte offset in the .idx file
    with open(dict_file) as df:
        dicts = {}
        for d in df:
            (key, offset) = d.split(', ')
            dicts[key] = int(offset)

    # term's index
    term_index = {}
    # query's parameter table
    query_table = {}
    # docs's parameter table
    docs_table = {}
    # docs set for merge document
    docs_set = set()
    # docs score hash, use cosine similarity score with weight use tf-idf
    docs_score = {}
    # Calculate query's weight
    for term in query_string:
        if term in dicts:
            term_index[term] = Index.read_index_by_offset(idx_file, dicts[term]).index[term]
            # add doc# to set
            for doc in term_index[term]:
                docs_set.add(int(doc))
            query_table[term] = {}
            query_table[term]["tf"] = 1
            query_table[term]["df"] = len(term_index[term])
            query_table[term]["idf"] = math.log(N / query_table[term]["df"], 10)
            query_table[term]["w"] = (1 + math.log(query_table[term]["tf"])) * query_table[term]["idf"]
        else:
            term_index[term] = {}
            query_table[term] = {}
            query_table[term]["tf"] = 1
            query_table[term]["df"] = 0
            query_table[term]["idf"] = 0
            query_table[term]["w"] = 0

    # Collect each candidate document's term frequencies
    for doc_id in docs_set:
        element = str(doc_id)
        docs_table[element] = {}
        for term in query_string:
            docs_table[element][term] = {}
            docs_table[element][term]["tf"] = 0
            if element in term_index[term]:
                docs_table[element][term]["tf"] = len(term_index[term][element])

    for doc in docs_table:
        for term in query_string:
            if docs_table[doc][term]["tf"] > 0:
                # tf-idf weight: (1 + log10 tf) * log10(N / df)
                docs_table[doc][term]["w"] = (1 + math.log(docs_table[doc][term]["tf"], 10)) * \
                    query_table[term]["idf"]
            else:
                docs_table[doc][term]["w"] = 0

    query_len = 0
    for term in query_string:
        query_len += query_table[term]["w"] * query_table[term]["w"]
    query_len = math.sqrt(query_len)
    if query_len == 0:
        print("No query term carries any weight in the index.", file=sys.stderr)
        exit(1)

    for doc in docs_table:
        up_part = 0
        doc_len = 0
        for term in query_string:
            up_part += docs_table[doc][term]["w"] * query_table[term]["w"]
            doc_len += docs_table[doc][term]["w"] * docs_table[doc][term]["w"]
        # cosine similarity between the document and query weight vectors
        docs_score[doc] = up_part / (math.sqrt(doc_len) * query_len) if doc_len else 0

    print("Query terms:", query_string)
    print("Top", return_count, "results:")
    print("doc#\tscore")

    for i in sorted(docs_score, key=docs_score.get, reverse=True):
        return_count -= 1
        if return_count < 0:
            break
        print("%d\t%.3f" % (int(i), docs_score[i]))