コード例 #1
0
__author__ = 'Moorthy'
import fileutils
from BeautifulSoup import BeautifulSoup
import json

# Directory that is scanned for the documents to convert.
# Every separator is written as an escaped backslash ("\\"); a third
# backslash in front of "ir_a03_bin_mine" would form the invalid escape "\i"
# and leave a doubled separator in the path.
doc_path = "D:\\ir_a03_bin_mine\\data\\document_inlinks\\"
# NOTE(review): presumably one entry per file found under doc_path --
# confirm against fileutils.getfilelist.
doc_files = fileutils.getfilelist(doc_path)


def convert_documents():
    """Rewrite every entry of ``doc_files`` as a tagged text record.

    Each document is loaded with ``fileutils.readdocumentwithinlin`` and its
    fields are wrapped, in this order, in ``<DOCNO>``, ``<HTTP_HEADER>``,
    ``<HEAD>``, ``<HTML_SOURCE>``, ``<TEXT>``, ``<OUTLINKS>`` and
    ``<INLINKS>`` sections.  The assembled text is handed to
    ``fileutils.write_final_document`` under the same ``docno``.  The
    ``docno`` is printed before, and a "COMPLETED" line after, each document.
    """
    for docno in doc_files:
        print(docno)
        doc = fileutils.readdocumentwithinlin(docno)

        pieces = []
        pieces.append("<DOCNO>"+doc['page_url']+"</DOCNO>\n")
        # 'headerdata' is a sequence of strings that is glued together as-is.
        headers = ''.join(doc['headerdata'])
        pieces.append("<HTTP_HEADER>\n"+headers+"\n</HTTP_HEADER>\n")
        pieces.append("<HEAD>"+doc['page_title']+"</HEAD>\n")
        pieces.append("<HTML_SOURCE>\n"+doc['raw_html']+"\n</HTML_SOURCE>\n")
        pieces.append("<TEXT>\n"+doc['clean_text']+"\n</TEXT>\n")

        # Link lists are optional: a missing key produces an empty section.
        # Going through set() drops duplicate links (order is then arbitrary).
        outgoing = set(doc['outlinks']) if "outlinks" in doc else []
        pieces.append("<OUTLINKS>\n"+",".join(outgoing)+"\n</OUTLINKS>\n")
        incoming = set(doc['inlink']) if "inlink" in doc else []
        pieces.append("<INLINKS>\n"+",".join(incoming)+"\n</INLINKS>\n")

        fileutils.write_final_document(docno, "".join(pieces))
        print("COMPLETED "+docno)
# Runs the conversion at module level, i.e. as soon as this file is executed
# or imported (there is no __main__ guard).
convert_documents()
コード例 #2
0
__author__ = 'Moorthy'

import fileutils
from BeautifulSoup import BeautifulSoup

# Directory whose listing provides the candidate documents.
doc_path = "D:\\ir_a03_bin_mine\\data\\document\\"
# Directory whose listing is subtracted from the candidates below.
formatted_doc_path = "D:\\ir_a03_bin_mine\\data\\document_formatted\\"


# NOTE(review): presumably getfilelist returns comparable names (not full
# paths) for both directories, otherwise the set difference below would never
# remove anything -- confirm against fileutils.getfilelist.
doc_files = fileutils.getfilelist(doc_path)
formatted_doc_files = fileutils.getfilelist(formatted_doc_path)

# Entries listed for doc_path that are absent from formatted_doc_path.
# The result goes through a set, so its order is arbitrary and duplicates
# are removed.
docs_to_format = list(set(doc_files) - set(formatted_doc_files))

def get_head(raw_html):
    """Return the ``string`` of the ``title`` element parsed from *raw_html*.

    The markup is parsed with ``BeautifulSoup`` and the value of
    ``title.string`` on the parsed document is returned unchanged.

    NOTE(review): presumably ``title`` is ``None`` for markup without a
    ``<title>`` tag, in which case this raises ``AttributeError`` -- confirm
    against the BeautifulSoup version in use.
    """
    parsed = BeautifulSoup(raw_html)
    title_tag = parsed.title
    return title_tag.string

def update_inlink_map():
    inlinkmap = {}
    for docno in docs_to_format:
        document = fileutils.readdocument(docno)
        current_page = document['page_url']
        for outlink in document['outlinks']:
            if outlink in inlinkmap:
                inlinks = inlinkmap[outlink]
                inlinks.append(current_page)
                inlinkmap[outlink] = inlinks
            else:
                inlinks = []
                inlinks.append(current_page)