def parseTrainFile(trainFile):
    """Parse a ranking training file into queries and their candidate documents.

    NOTE(review): this definition is immediately shadowed by a second
    ``parseTrainFile`` defined right below it, which additionally returns
    the flat ``docs`` list.  Callers that need all three values should use
    that one; consider deleting this duplicate.

    The file is a sequence of ``key: value`` lines.  A ``query`` line starts
    a new query; a ``url`` line starts a new Document attached to the current
    query; the remaining keys fill in fields of the most recent document.
    After parsing, corpus-wide average field lengths are computed and every
    document's feature vectors are built against the ``DocFreqDict`` file.

    Args:
        trainFile: path to the training file.

    Returns:
        (queries, query2docs): the list of query strings in file order, and
        a dict mapping each query string to its list of Document objects.
    """
    queries = []
    docs = []
    query2docs = {}
    # Context manager guarantees the handle is closed even if a line is malformed.
    with open(trainFile, 'r') as train_f:
        for line in train_f:
            # Split only on the FIRST ':' — values such as URLs contain colons.
            key, _, value = line.partition(':')
            key = key.strip()
            value = value.strip()
            if key == 'query':
                query = value
                queries.append(query)
                query2docs[query] = []
            elif key == 'url':
                doc = Document(query)
                docs.append(doc)
                query2docs[query].append(doc)
                doc.set_url(value)
            elif key == 'title':
                doc.set_title(value)
            elif key == 'header':
                doc.add_header(value)
            elif key == 'body_hits':
                # Format: "term pos1 pos2 ...".  Build a concrete list (not a
                # lazy map object) so the positions can be iterated repeatedly.
                term, _, positions = value.partition(' ')
                doc.add_body_hits(term.strip(),
                                  [int(p) for p in positions.split()])
            elif key == 'body_length':
                doc.set_body_length(int(value))
            elif key == 'pagerank':
                doc.set_pagerank(int(value))
            elif key == 'anchor_text':
                # Remember the text; its count arrives on the following line.
                anchor_text = value
            elif key == 'stanford_anchor_count':
                doc.add_anchor(anchor_text, int(value))

    # Corpus-wide average field lengths, used for document length normalization.
    doc_count = len(docs)
    avlen_url = sum(doc.url_length for doc in docs) / doc_count
    avlen_title = sum(doc.title_length for doc in docs) / doc_count
    avlen_header = sum(doc.header_length for doc in docs) / doc_count
    avlen_body = sum(doc.body_length for doc in docs) / doc_count
    avlen_anchor = sum(doc.anchor_length for doc in docs) / doc_count

    with open('DocFreqDict', 'rb') as doc_freq_f:
        doc_freq_dict = marshal.load(doc_freq_f)

    for doc in docs:
        doc.set_averages(avlen_url, avlen_title, avlen_header,
                         avlen_body, avlen_anchor)
        doc.make_vectors(doc_freq_dict)

    return queries, query2docs
def parseTrainFile(trainFile):
    """Parse a ranking training file into queries and their candidate documents.

    The file is a sequence of ``key: value`` lines.  A ``query`` line starts
    a new query; a ``url`` line starts a new Document attached to the current
    query; the remaining keys fill in fields of the most recent document.
    After parsing, corpus-wide average field lengths are computed and every
    document's feature vectors are built against the ``DocFreqDict`` file.

    Args:
        trainFile: path to the training file.

    Returns:
        (queries, docs, query2docs): the query strings in file order, the
        flat list of all Document objects, and a dict mapping each query
        string to its list of Document objects.
    """
    queries = []
    docs = []
    query2docs = {}
    # Context manager guarantees the handle is closed even if a line is malformed.
    with open(trainFile, 'r') as train_f:
        for line in train_f:
            # Split only on the FIRST ':' — values such as URLs contain colons.
            key, _, value = line.partition(':')
            key = key.strip()
            value = value.strip()
            if key == 'query':
                query = value
                queries.append(query)
                query2docs[query] = []
            elif key == 'url':
                doc = Document(query)
                docs.append(doc)
                query2docs[query].append(doc)
                doc.set_url(value)
            elif key == 'title':
                doc.set_title(value)
            elif key == 'header':
                doc.add_header(value)
            elif key == 'body_hits':
                # Format: "term pos1 pos2 ...".  Build a concrete list (not a
                # lazy map object) so the positions can be iterated repeatedly.
                term, _, positions = value.partition(' ')
                doc.add_body_hits(term.strip(),
                                  [int(p) for p in positions.split()])
            elif key == 'body_length':
                doc.set_body_length(int(value))
            elif key == 'pagerank':
                doc.set_pagerank(int(value))
            elif key == 'anchor_text':
                # Remember the text; its count arrives on the following line.
                anchor_text = value
            elif key == 'stanford_anchor_count':
                doc.add_anchor(anchor_text, int(value))

    # Corpus-wide average field lengths, used for document length normalization.
    doc_count = len(docs)
    avlen_url = sum(doc.url_length for doc in docs) / doc_count
    avlen_title = sum(doc.title_length for doc in docs) / doc_count
    avlen_header = sum(doc.header_length for doc in docs) / doc_count
    avlen_body = sum(doc.body_length for doc in docs) / doc_count
    avlen_anchor = sum(doc.anchor_length for doc in docs) / doc_count

    with open('DocFreqDict', 'rb') as doc_freq_f:
        doc_freq_dict = marshal.load(doc_freq_f)

    for doc in docs:
        doc.set_averages(avlen_url, avlen_title, avlen_header,
                         avlen_body, avlen_anchor)
        doc.make_vectors(doc_freq_dict)

    return queries, docs, query2docs
CAL = "grafics/Kalender.svg" FN = "grafics/Dateiname.svg" if __name__ == "__main__": ############################## # PORTRAIT DOCUMENT doc = Document(orientation="portrait") footerText = [["left", "left middle", "right middle", "right"]] rightHeaderLines = [["top line"], ["middle line"], ["bottom line"]] headerText = [["", "MAIN TITLE", ""]] header = create_header(headerText, rightHeaderLines, LOGO) footer = create_footer(footerText, [USER, CAL, "", FN]) doc.add_footer(footer) doc.add_header(header) spacer = create_vertical_spacer(4.8) # table titleTableText = [["Title", "Title"], ["Subtitle", "Subtitle"], ["Datum", "Datum"]] titleTableData = { "body": titleTableText, "hTableAlignment": "CENTER", "colWidths": [8, 8], "fontsize": 15, } table_title = create_table(**titleTableData) # create a figure fig, leg = create_figure(hAlign=ar.TA_CENTER) doc.add_page(bookmark=ar.Bookmark("Main Chapter", 0), framestyle="single",