Beispiel #1
0
    # Populate url_list from url_file: one Url record per input line.
    # Every line is split on tab characters; field 0 is kept verbatim as the
    # summary and field 1 is parsed as the integer document id.
    for line in url_file.readlines():
        line = line.split('\t')
        summary = line[0]
        # int() ignores surrounding whitespace, so a trailing newline left on
        # the field by readlines() does not break the conversion.
        docId = int(line[1])
        url = Url()
        url.m_summary=summary
        url.m_docId=docId
        url_list.append(url)
    
    # Populate doc_list from doc_file: one Document record per input line.
    # Every line is split on tab characters; field 0 is parsed as the integer
    # document id and field 1 as an integer position.
    # NOTE(review): m_pos is later handed to raw_file.seek(), so it looks like
    # an offset into raw_file -- confirm the unit (bytes vs. characters).
    for line in doc_file.readlines():
        line = line.split('\t')
        docId = int(line[0])
        pos = int(line[1])
        document = Document()
        document.m_docId = docId
        document.m_pos = pos
        doc_list.append(document)

    # Empty dictionaries created here; they are not filled in the lines
    # visible in this part of the function.
    index_dict = {}
    title_index_dict = {}
    # Opened in 'w' mode, which truncates any existing file at this path.
    # NOTE(review): no close()/with-statement is visible here -- confirm the
    # handle is closed further down.
    output_file = open(DATA_PATH + "rawlist.data.segment",'w')
    # Iterate over indices 0 .. len(url_list) - 2. The last index is left out,
    # presumably because each iteration reads doc_list[docId+1] to find where
    # the current document ends.
    # NOTE(review): the range is sized by url_list but the body indexes
    # doc_list -- this assumes doc_list has at least as many entries; verify.
    for docId in range(0,len(url_list) - 1):
        # Span between this document's position and the next one's, minus one.
        # NOTE(review): the "- 1" presumably drops a separator between
        # consecutive documents -- confirm against the raw file format.
        length = doc_list[docId+1].m_pos - doc_list[docId].m_pos - 1;
        # A negative span aborts the whole loop (break), not only this entry.
        if length < 0:
            break
        # Jump to the document's recorded position and read its content.
        raw_file.seek(doc_list[docId].m_pos)
        content = raw_file.read(length)
        # The header is taken as the text from the first 'version:' up to
        # (not including) the first '<!DOCTYPE'.
        # NOTE(review): find() returns -1 when a marker is missing; the slice
        # below would then silently use -1 as an index instead of failing.
        head_start = content.find('version:')
        head_end = content.find('<!DOCTYPE')
        head = content[head_start:head_end]