def get_affiliation_topics(topics_func):
    """Relate topics and affiliations in both directions for the EGC corpus.

    Parameters
    ----------
    topics_func : callable
        Zero-argument callable returning the triple
        (dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs).

    Returns
    -------
    tuple
        (dict_affiliation_top_topics, dict_topics_top_affiliations):
        affiliation -> list of per-document topic lists, and
        topic_id -> 5 most common affiliations among its top documents.
        (Previously the results were computed and then discarded via `pass`;
        callers that ignored the old None return are unaffected.)
    """
    # EGC data
    egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    # Topics
    dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs = topics_func()
    # Affiliations
    dict_author_affiliations = load_ciprian_affiliations("../../ciprian/output/authors_email_affiliation.csv")
    dict_doc_affiliations = load_doc_affiliations(egc_df)

    def _resolve_affiliations(doc_id):
        # Prefer affiliations extracted from the document itself; fall back to
        # the author -> affiliation map built from e-mail addresses.
        doc_affiliations = dict_doc_affiliations[doc_id]
        if not doc_affiliations:
            doc_authors = [
                a.strip()  # strip() already covers lstrip()
                for a in egc_df[egc_df["id"] == int(doc_id)]["authors"].values[0].lower().split(",")
            ]
            doc_affiliations = get_document_affiliation(dict_author_affiliations, doc_authors)
        return doc_affiliations

    dict_affiliation_top_topics = defaultdict(list)
    non_mails = 0
    for doc_id, topics_ids in dict_doc_top_topics.items():
        doc_affiliations = _resolve_affiliations(doc_id)
        if not doc_affiliations:
            non_mails += 1
            continue
        # Reuse the already-bound topic list instead of a second dict lookup.
        for affi in doc_affiliations:
            dict_affiliation_top_topics[affi].append(topics_ids)
    # print(...) with a single argument behaves identically under Python 2 and 3.
    print("There are {0} docs from {1} with no emails available".format(non_mails, len(dict_doc_top_topics)))

    dict_topics_top_affiliations = {}
    for topic_id, top_docs in dict_topic_top_docs.items():
        temp_affiliations = []
        for doc_id in top_docs:
            doc_affiliations = _resolve_affiliations(doc_id)
            if doc_affiliations:
                temp_affiliations.extend(doc_affiliations)
        dict_topics_top_affiliations[topic_id] = Counter(temp_affiliations).most_common(5)

    return dict_affiliation_top_topics, dict_topics_top_affiliations
def get_affiliation_topics(topics_func):
    """Build affiliation<->topic mappings for the EGC corpus.

    Parameters
    ----------
    topics_func : callable
        Zero-argument callable returning
        (dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs).

    Returns
    -------
    tuple
        (dict_affiliation_top_topics, dict_topics_top_affiliations).
        The first maps each affiliation to the topic lists of its documents;
        the second maps each topic to the 5 most frequent affiliations among
        its top documents. (The original version dropped both results with a
        trailing `pass`; returning them is backward-compatible for callers
        that ignored the previous None.)
    """
    # EGC data
    egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    # Topics
    dict_topic_top_words, dict_doc_top_topics, dict_topic_top_docs = topics_func()
    # Affiliations
    dict_author_affiliations = load_ciprian_affiliations("../../ciprian/output/authors_email_affiliation.csv")
    dict_doc_affiliations = load_doc_affiliations(egc_df)

    def _affiliations_for(doc_id):
        # Document-level affiliations when available; otherwise derive them
        # from the per-author e-mail affiliation mapping.
        affiliations = dict_doc_affiliations[doc_id]
        if not affiliations:
            authors = [a.strip() for a in
                       egc_df[egc_df["id"] == int(doc_id)]["authors"].values[0].lower().split(",")]
            affiliations = get_document_affiliation(dict_author_affiliations, authors)
        return affiliations

    dict_affiliation_top_topics = defaultdict(list)
    non_mails = 0
    for doc_id, topics_ids in dict_doc_top_topics.items():
        affiliations = _affiliations_for(doc_id)
        if not affiliations:
            # No document or author affiliation could be found.
            non_mails += 1
            continue
        for affi in affiliations:
            # `topics_ids` is the same object the original re-fetched from the dict.
            dict_affiliation_top_topics[affi].append(topics_ids)
    # Single-argument print(...) is valid and identical in Python 2 and 3.
    print("There are {0} docs from {1} with no emails available".format(non_mails, len(dict_doc_top_topics)))

    dict_topics_top_affiliations = {}
    for topic_id, top_docs in dict_topic_top_docs.items():
        temp_affiliations = []
        for doc_id in top_docs:
            affiliations = _affiliations_for(doc_id)
            if affiliations:
                temp_affiliations.extend(affiliations)
        dict_topics_top_affiliations[topic_id] = Counter(temp_affiliations).most_common(5)

    return dict_affiliation_top_topics, dict_topics_top_affiliations
                break
            # finally, if we found one of the possible location markers, we keep the line
            findo = re.findall(possible_location, l)
            if findo:
                locaion_list.append(l)
        return locaion_list

    def treat_asteriks(text_list, asterisks_idx):
        """Return the masked lines with all asterisk markers removed.

        Parameters
        ----------
        text_list : sequence of str
            Candidate header lines from the first PDF page.
        asterisks_idx : sequence of bool
            Boolean mask selecting the lines that carry an asterisk marker.

        Returns
        -------
        list of str
            Selected lines with every '*' / U+2217 run stripped.
        """
        asterisks_idx = np.array(asterisks_idx)
        text_list = np.array(text_list)
        # BUG FIX: re.sub's 4th positional argument is `count`, not `flags`;
        # the original passed re.UNICODE (== 32) as a replacement count.
        # The non-raw u"..." literal is equivalent to the old ur"..." here
        # ('*' needs no escaping inside a character class) and is valid on
        # both Python 2 and 3.
        lines_with_asterisk = [re.sub(u"[\u2217*]+", "", l, flags=re.UNICODE) for l in text_list[asterisks_idx]]

        return lines_with_asterisk

    egc_df = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    locations_dict = defaultdict(list)
    for page_idx, doc in one_pages.iteritems():
        authors = [n.lower() for n in egc_df[egc_df["id"] == int(page_idx)]["authors"].values[0].split(",")]
        doc_split = [l for l in doc.split("\n") if len(l.strip()) > 2]
        first_lines = [l for l in doc_split[2:10]]
        text_lines = "\n".join(first_lines)
        # if u"*" in text_lines or u"\u2217" in text_lines:
        asterisk_idx = [True if l[0] in asterisks else False for l in first_lines]
        if any(asterisk_idx):
            info_location = treat_asteriks(first_lines, asterisk_idx)
            # print info_location
            # print
            # print text_lines
            # print "***"*80
def get_abstract_and_title_topics():
    """Cluster EGC articles into 15 NMF topics over title + abstract text."""
    egc_df = get_EGC_articles(
        load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))

    # Missing abstracts become "" so the concatenation never produces NaN.
    abstracts = egc_df["abstract"].fillna("")
    egc_df["title+abstract"] = egc_df["title"] + " " + abstracts
    data = {doc_id: text
            for doc_id, text in zip(egc_df["id"].tolist(),
                                    egc_df["title+abstract"].tolist())}
    return nmf_clustering(data, k=15)
            if findo:
                locaion_list.append(l)
        return locaion_list

    def treat_asteriks(text_list, asterisks_idx):
        """Strip asterisk markers ('*' and U+2217) from the masked lines.

        Parameters
        ----------
        text_list : sequence of str
            Candidate header lines.
        asterisks_idx : sequence of bool
            Mask selecting the lines flagged as asterisk-marked.

        Returns
        -------
        list of str
            The selected lines with asterisk runs removed.
        """
        asterisks_idx = np.array(asterisks_idx)
        text_list = np.array(text_list)
        lines_with_asterisk = [
            # BUG FIX: the 4th positional re.sub argument is `count`, not
            # `flags` — re.UNICODE (== 32) was being used as a count. The
            # u"..." literal matches the old ur"..." pattern exactly and is
            # accepted by both Python 2 and 3.
            re.sub(u"[\u2217*]+", "", l, flags=re.UNICODE)
            for l in text_list[asterisks_idx]
        ]

        return lines_with_asterisk

    egc_df = get_EGC_articles(
        load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))
    one_pages = load_text_data("../../input/pdfs/1page/", "txt")
    locations_dict = defaultdict(list)
    for page_idx, doc in one_pages.iteritems():
        authors = [
            n.lower() for n in egc_df[egc_df["id"] == int(page_idx)]
            ["authors"].values[0].split(",")
        ]
        doc_split = [l for l in doc.split("\n") if len(l.strip()) > 2]
        first_lines = [l for l in doc_split[2:10]]
        text_lines = "\n".join(first_lines)
        # if u"*" in text_lines or u"\u2217" in text_lines:
        asterisk_idx = [
            True if l[0] in asterisks else False for l in first_lines
        ]
        if any(asterisk_idx):
def get_abstract_and_title_topics():
    """Run NMF topic clustering (k=15) on each article's title and abstract."""
    articles = get_EGC_articles(load_data_egc("../../input/RNTI_articles_export_fixed1347_ids.txt"))

    # Replace NaN abstracts with "" so every document keeps at least its title.
    articles["title+abstract"] = articles["title"] + " " + articles["abstract"].fillna("")
    ids = articles["id"].tolist()
    texts = articles["title+abstract"].tolist()
    return nmf_clustering(dict(zip(ids, texts)), k=15)