コード例 #1
0
ファイル: rank.py プロジェクト: yishuwei/infoRetrieval
def parseTrainFile(trainFile):
    """Parse a training file into queries and their candidate documents.

    The file is read as ``key: value`` lines. A ``query`` line starts a new
    query, a ``url`` line starts a new ``Document`` for the current query, and
    the remaining recognized keys (``title``, ``header``, ``body_hits``,
    ``body_length``, ``pagerank``, ``anchor_text``,
    ``stanford_anchor_count``) fill in the most recently started document.
    Lines with any other key are ignored, as are blank lines.

    Once every line is consumed, the mean url/title/header/body/anchor length
    over all documents is passed to each document through ``set_averages``,
    and ``make_vectors`` is called on each document with the dictionary
    loaded from the ``DocFreqDict`` marshal file in the working directory.
    When the file yields no documents this step is skipped entirely.

    Args:
        trainFile: Path of the training file to read.

    Returns:
        A ``(queries, query2docs)`` tuple: the query strings in file order,
        and a dict mapping each query string to its list of documents.

    Raises:
        IndexError: If a non-blank line has no ``:`` separator.
    """
    queries = []
    docs = []
    query2docs = {}

    # `with` closes the file even when a malformed line raises mid-parse.
    with open(trainFile, 'r') as train_f:
        for line in train_f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = line.split(':', 1)
            key = fields[0].strip()
            value = fields[1].strip()
            if key == 'query':
                query = value
                queries.append(query)
                query2docs[query] = []
            elif key == 'url':
                doc = Document(query)
                docs.append(doc)
                query2docs[query].append(doc)
                doc.set_url(value)
            elif key == 'title':
                doc.set_title(value)
            elif key == 'header':
                doc.add_header(value)
            elif key == 'body_hits':
                term, positions = value.split(' ', 1)
                # A real list (not a lazy `map` object) so the callee can
                # iterate or index it repeatedly on Python 3 as well.
                doc.add_body_hits(term.strip(),
                                  [int(pos) for pos in positions.split()])
            elif key == 'body_length':
                doc.set_body_length(int(value))
            elif key == 'pagerank':
                doc.set_pagerank(int(value))
            elif key == 'anchor_text':
                # Held until the matching count line arrives.
                anchor_text = value
            elif key == 'stanford_anchor_count':
                doc.add_anchor(anchor_text, int(value))

    if not docs:
        # Nothing to average or vectorize; also avoids dividing by zero.
        return queries, query2docs

    doc_count = len(docs)
    avlen_url = sum(doc.url_length for doc in docs) / doc_count
    avlen_title = sum(doc.title_length for doc in docs) / doc_count
    avlen_header = sum(doc.header_length for doc in docs) / doc_count
    avlen_body = sum(doc.body_length for doc in docs) / doc_count
    avlen_anchor = sum(doc.anchor_length for doc in docs) / doc_count

    with open('DocFreqDict', 'rb') as doc_freq_f:
        doc_freq_dict = marshal.load(doc_freq_f)
    for doc in docs:
        doc.set_averages(avlen_url, avlen_title, avlen_header, avlen_body,
                         avlen_anchor)
        doc.make_vectors(doc_freq_dict)

    return queries, query2docs
コード例 #2
0
ファイル: rank.py プロジェクト: yishuwei/infoRetrieval
def parseTrainFile(trainFile):
    """Parse a training file into queries, documents and their mapping.

    The file is read as ``key: value`` lines. A ``query`` line starts a new
    query, a ``url`` line starts a new ``Document`` for the current query, and
    the remaining recognized keys (``title``, ``header``, ``body_hits``,
    ``body_length``, ``pagerank``, ``anchor_text``,
    ``stanford_anchor_count``) fill in the most recently started document.
    Lines with any other key are ignored, as are blank lines.

    Once every line is consumed, the mean url/title/header/body/anchor length
    over all documents is passed to each document through ``set_averages``,
    and ``make_vectors`` is called on each document with the dictionary
    loaded from the ``DocFreqDict`` marshal file in the working directory.
    When the file yields no documents this step is skipped entirely.

    Args:
        trainFile: Path of the training file to read.

    Returns:
        A ``(queries, docs, query2docs)`` tuple: the query strings in file
        order, every document in file order, and a dict mapping each query
        string to its list of documents.

    Raises:
        IndexError: If a non-blank line has no ``:`` separator.
    """
    queries = []
    docs = []
    query2docs = {}

    # `with` closes the file even when a malformed line raises mid-parse.
    with open(trainFile, 'r') as train_f:
        for line in train_f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            fields = line.split(':', 1)
            key = fields[0].strip()
            value = fields[1].strip()
            if key == 'query':
                query = value
                queries.append(query)
                query2docs[query] = []
            elif key == 'url':
                doc = Document(query)
                docs.append(doc)
                query2docs[query].append(doc)
                doc.set_url(value)
            elif key == 'title':
                doc.set_title(value)
            elif key == 'header':
                doc.add_header(value)
            elif key == 'body_hits':
                term, positions = value.split(' ', 1)
                # A real list (not a lazy `map` object) so the callee can
                # iterate or index it repeatedly on Python 3 as well.
                doc.add_body_hits(term.strip(),
                                  [int(pos) for pos in positions.split()])
            elif key == 'body_length':
                doc.set_body_length(int(value))
            elif key == 'pagerank':
                doc.set_pagerank(int(value))
            elif key == 'anchor_text':
                # Held until the matching count line arrives.
                anchor_text = value
            elif key == 'stanford_anchor_count':
                doc.add_anchor(anchor_text, int(value))

    if not docs:
        # Nothing to average or vectorize; also avoids dividing by zero.
        return queries, docs, query2docs

    doc_count = len(docs)
    avlen_url = sum(doc.url_length for doc in docs) / doc_count
    avlen_title = sum(doc.title_length for doc in docs) / doc_count
    avlen_header = sum(doc.header_length for doc in docs) / doc_count
    avlen_body = sum(doc.body_length for doc in docs) / doc_count
    avlen_anchor = sum(doc.anchor_length for doc in docs) / doc_count

    with open('DocFreqDict', 'rb') as doc_freq_f:
        doc_freq_dict = marshal.load(doc_freq_f)
    for doc in docs:
        doc.set_averages(avlen_url, avlen_title, avlen_header, avlen_body,
                         avlen_anchor)
        doc.make_vectors(doc_freq_dict)

    return queries, docs, query2docs
コード例 #3
0
ファイル: example_document.py プロジェクト: NuCOS/autobasedoc
# Relative paths to SVG graphics; both are passed, together with USER, in the
# list handed to create_footer in the __main__ demo below.
# NOTE(review): presumably footer icons for a calendar ("Kalender") and a file
# name ("Dateiname") - confirm against create_footer.
CAL = "grafics/Kalender.svg"
FN = "grafics/Dateiname.svg"

if __name__ == "__main__":
    ##############################
    # PORTRAIT DOCUMENT
    doc = Document(orientation="portrait")
    footerText = [["left", "left middle", "right middle", "right"]]
    rightHeaderLines = [["top line"], ["middle line"], ["bottom line"]]

    headerText = [["", "MAIN TITLE", ""]]
    header = create_header(headerText, rightHeaderLines, LOGO)
    footer = create_footer(footerText, [USER, CAL, "", FN])

    doc.add_footer(footer)
    doc.add_header(header)
    spacer = create_vertical_spacer(4.8)
    # table
    titleTableText = [["Title", "Title"], ["Subtitle", "Subtitle"],
                      ["Datum", "Datum"]]
    titleTableData = {
        "body": titleTableText,
        "hTableAlignment": "CENTER",
        "colWidths": [8, 8],
        "fontsize": 15,
    }
    table_title = create_table(**titleTableData)
    # create a figure
    fig, leg = create_figure(hAlign=ar.TA_CENTER)
    doc.add_page(bookmark=ar.Bookmark("Main Chapter", 0),
                 framestyle="single",