def get_affiliation_topics(topics_func): # EGC data egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt")) # Topics dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs = topics_func() # Affiliations dict_author_affiliations = load_ciprian_affiliations("../../ciprian/output/authors_email_affiliation.csv") dict_doc_affiliations = load_doc_affiliations(egc_df) dict_affiliation_top_topics = defaultdict(list) non_mails = 0 for doc_id, topics_ids in dict_doc_top_topics.iteritems(): doc_affiliations = dict_doc_affiliations[doc_id] if not doc_affiliations: doc_authors = [ a.lstrip().strip() for a in egc_df[egc_df["id"] == int(doc_id)]["authors"].values[0].lower().split(",") ] doc_affiliations = get_document_affiliation(dict_author_affiliations, doc_authors) if not doc_affiliations: non_mails += 1 continue doc_topics = dict_doc_top_topics[doc_id] for affi in doc_affiliations: dict_affiliation_top_topics[affi].append(doc_topics) print "There are {0} docs from {1} with no emails available".format(non_mails, len(dict_doc_top_topics)) dict_topics_top_affiliations = {} for topic_id, top_docs in dict_topic_top_docs.iteritems(): temp_affiliations = [] for doc_id in top_docs: doc_affiliations = dict_doc_affiliations[doc_id] if not doc_affiliations: doc_authors = [ a.lstrip().strip() for a in egc_df[egc_df["id"] == int(doc_id)]["authors"].values[0].lower().split(",") ] doc_affiliations = get_document_affiliation(dict_author_affiliations, doc_authors) if not doc_affiliations: continue temp_affiliations.extend(doc_affiliations) dict_topics_top_affiliations[topic_id] = Counter(temp_affiliations).most_common(5) pass
def get_affiliation_topics(topics_func): # EGC data egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt")) # Topics dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs = topics_func() # Affiliations dict_author_affiliations = load_ciprian_affiliations("../../ciprian/output/authors_email_affiliation.csv") dict_doc_affiliations = load_doc_affiliations(egc_df) dict_affiliation_top_topics = defaultdict(list) non_mails = 0 for doc_id, topics_ids in dict_doc_top_topics.iteritems(): doc_affiliations = dict_doc_affiliations[doc_id] if not doc_affiliations: doc_authors = [a.lstrip().strip() for a in egc_df[egc_df["id"] == int(doc_id)]["authors"].values[0].lower().split(",")] doc_affiliations = get_document_affiliation(dict_author_affiliations, doc_authors) if not doc_affiliations: non_mails += 1 continue doc_topics = dict_doc_top_topics[doc_id] for affi in doc_affiliations: dict_affiliation_top_topics[affi].append(doc_topics) print "There are {0} docs from {1} with no emails available".format(non_mails, len(dict_doc_top_topics)) dict_topics_top_affiliations = {} for topic_id, top_docs in dict_topic_top_docs.iteritems(): temp_affiliations = [] for doc_id in top_docs: doc_affiliations = dict_doc_affiliations[doc_id] if not doc_affiliations: doc_authors = [a.lstrip().strip() for a in egc_df[egc_df["id"] == int(doc_id)]["authors"].values[0].lower().split(",")] doc_affiliations = get_document_affiliation(dict_author_affiliations, doc_authors) if not doc_affiliations: continue temp_affiliations.extend(doc_affiliations) dict_topics_top_affiliations[topic_id] = Counter(temp_affiliations).most_common(5) pass
break # finally, if we found one of the possible location markers, we keep the line findo = re.findall(possible_location, l) if findo: locaion_list.append(l) return locaion_list def treat_asteriks(text_list, asterisks_idx): asterisks_idx = np.array(asterisks_idx) text_list = np.array(text_list) lines_with_asterisk = [re.sub(ur"[\u2217\*]+", "", l, re.UNICODE) for l in text_list[asterisks_idx]] return lines_with_asterisk egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt")) one_pages = load_text_data("../../input/pdfs/1page/", "txt") locations_dict = defaultdict(list) for page_idx, doc in one_pages.iteritems(): authors = [n.lower() for n in egc_df[egc_df["id"] == int(page_idx)]["authors"].values[0].split(",")] doc_split = [l for l in doc.split("\n") if len(l.strip()) > 2] first_lines = [l for l in doc_split[2:10]] text_lines = "\n".join(first_lines) # if u"*" in text_lines or u"\u2217" in text_lines: asterisk_idx = [True if l[0] in asterisks else False for l in first_lines] if any(asterisk_idx): info_location = treat_asteriks(first_lines, asterisk_idx) # print info_location # print # print text_lines # print "***"*80
def get_abstract_and_title_topics():
    """Run NMF clustering (k=15) over each EGC article's title + abstract."""
    articles = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    # Concatenate title and abstract; missing abstracts become empty strings.
    articles["title+abstract"] = articles["title"] + " " + articles["abstract"].fillna("")
    corpus = dict(zip(articles["id"].tolist(), articles["title+abstract"].tolist()))
    return nmf_clustering(corpus, k=15)
if findo: locaion_list.append(l) return locaion_list def treat_asteriks(text_list, asterisks_idx): asterisks_idx = np.array(asterisks_idx) text_list = np.array(text_list) lines_with_asterisk = [ re.sub(ur"[\u2217\*]+", "", l, re.UNICODE) for l in text_list[asterisks_idx] ] return lines_with_asterisk egc_df = get_EGC_articles( load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt")) one_pages = load_text_data("../../input/pdfs/1page/", "txt") locations_dict = defaultdict(list) for page_idx, doc in one_pages.iteritems(): authors = [ n.lower() for n in egc_df[egc_df["id"] == int(page_idx)] ["authors"].values[0].split(",") ] doc_split = [l for l in doc.split("\n") if len(l.strip()) > 2] first_lines = [l for l in doc_split[2:10]] text_lines = "\n".join(first_lines) # if u"*" in text_lines or u"\u2217" in text_lines: asterisk_idx = [ True if l[0] in asterisks else False for l in first_lines ] if any(asterisk_idx):