# Load the URL table: each line is tab-separated, with the summary in the
# first field and the integer document id in the second.
for line in url_file.readlines():
    line = line.split('\t')
    summary, docId = line[0], int(line[1])
    url = Url()
    url.m_summary, url.m_docId = summary, docId
    url_list.append(url)

# Load the document table: each line is tab-separated, with the integer
# document id in the first field and an integer position in the second.
for line in doc_file.readlines():
    line = line.split('\t')
    docId, pos = int(line[0]), int(line[1])
    document = Document()
    document.m_docId, document.m_pos = docId, pos
    doc_list.append(document)

# Containers for the indexes and the output file for this run.
index_dict = {}
title_index_dict = {}
output_file = open(DATA_PATH + "rawlist.data.segment", 'w')

# Walk consecutive document entries; the loop stops one short of
# len(url_list) because each iteration also reads entry docId + 1.
# NOTE(review): indexes doc_list using url_list's length -- presumably both
# lists have the same number of entries; confirm.
for docId in range(len(url_list) - 1):
    # Size of this document's raw record: the gap between this entry's
    # position and the next one's, minus one.
    # NOTE(review): the "- 1" presumably skips a separator -- confirm.
    length = doc_list[docId + 1].m_pos - doc_list[docId].m_pos - 1
    if length < 0:
        # A negative span ends processing of all remaining documents.
        break

    raw_file.seek(doc_list[docId].m_pos)
    content = raw_file.read(length)

    # Slice out the text between the 'version:' marker and '<!DOCTYPE'.
    # NOTE(review): find() yields -1 when a marker is absent, in which case
    # this slice is not the header -- confirm both markers always exist.
    head_start = content.find('version:')
    head_end = content.find('<!DOCTYPE')
    head = content[head_start:head_end]
    url = ""