"""Convert crawled documents (with in-links) into a tagged text format.

Each entry returned by ``fileutils.getfilelist(doc_path)`` is loaded with
``fileutils.readdocumentwithinlin``, rendered as a sequence of tagged
sections and passed to ``fileutils.write_final_document``.
"""
__author__ = 'Moorthy'

import fileutils
from BeautifulSoup import BeautifulSoup
import json

# Directory holding the documents to convert.  The separator after the drive
# letter used to be written as "\\\i", which is an invalid escape sequence
# and produced a doubled backslash; it is now a single escaped backslash.
doc_path = "D:\\ir_a03_bin_mine\\data\\document_inlinks\\"
doc_files = fileutils.getfilelist(doc_path)


def _link_section(tag, links):
    """Return a ``<tag>`` section listing *links* on one comma-separated line.

    Duplicates are removed.  The remaining links are sorted so the generated
    text does not depend on set iteration order.  An empty *links* yields an
    empty line between the opening and closing tags.
    """
    joined = ",".join(sorted(set(links)))
    return "<" + tag + ">\n" + joined + "\n</" + tag + ">\n"


def _render_document(document):
    """Build the tagged text for one loaded *document*.

    *document* must provide the keys ``page_url``, ``headerdata``,
    ``page_title``, ``raw_html`` and ``clean_text``; ``outlinks`` and
    ``inlink`` are optional and are rendered as empty sections when absent.
    """
    outlinks = document['outlinks'] if "outlinks" in document else []
    inlinks = document['inlink'] if "inlink" in document else []
    sections = [
        "<DOCNO>" + document['page_url'] + "</DOCNO>\n",
        # headerdata is an iterable of strings; its pieces are concatenated
        # without any separator.
        "<HTTP_HEADER>\n" + ''.join(document['headerdata'])
        + "\n</HTTP_HEADER>\n",
        "<HEAD>" + document['page_title'] + "</HEAD>\n",
        "<HTML_SOURCE>\n" + document['raw_html'] + "\n</HTML_SOURCE>\n",
        "<TEXT>\n" + document['clean_text'] + "\n</TEXT>\n",
        _link_section("OUTLINKS", outlinks),
        _link_section("INLINKS", inlinks),
    ]
    return "".join(sections)


def convert_documents():
    """Render every entry of ``doc_files`` and write the result out.

    For each entry the identifier is printed, the document is loaded through
    ``fileutils.readdocumentwithinlin``, rendered, and handed to
    ``fileutils.write_final_document`` under the same identifier; a
    ``COMPLETED`` line is printed afterwards.
    """
    for docno in doc_files:
        # Single-argument parenthesised print behaves the same as the
        # Python 2 print statement and also parses under Python 3.
        print(docno)
        document = fileutils.readdocumentwithinlin(docno)
        fileutils.write_final_document(docno, _render_document(document))
        print("COMPLETED " + docno)


# The conversion runs as soon as the module is executed, as before.
convert_documents()
__author__ = 'Moorthy' import fileutils from BeautifulSoup import BeautifulSoup doc_path = "D:\\ir_a03_bin_mine\\data\\document\\" formatted_doc_path = "D:\\ir_a03_bin_mine\\data\\document_formatted\\" doc_files = fileutils.getfilelist(doc_path) formatted_doc_files = fileutils.getfilelist(formatted_doc_path) docs_to_format = list(set(doc_files) - set(formatted_doc_files)) def get_head(raw_html): soup = BeautifulSoup(raw_html) return soup.title.string def update_inlink_map(): inlinkmap = {} for docno in docs_to_format: document = fileutils.readdocument(docno) current_page = document['page_url'] for outlink in document['outlinks']: if outlink in inlinkmap: inlinks = inlinkmap[outlink] inlinks.append(current_page) inlinkmap[outlink] = inlinks else: inlinks = [] inlinks.append(current_page)