Example #1
import PPI_cite_main  # project module that drives the indexing pipeline


def main(q1, q2, articles, max_sentences):
    # Normalize both protein queries to lower case.
    q1 = q1.lower()
    q2 = q2.lower()
    first_two_proteins = [q1, q2]
    # make_a1_file_object is a project helper defined alongside this entry point.
    a1_file = make_a1_file_object(first_two_proteins)
    print(a1_file.proteins)
    PPI_cite_main.index(a1_file, articles, max_sentences)
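A call to this entry point might look like the line below; the argument values are illustrative, and the reading of articles as a cap on fetched PubMed articles and max_sentences as a cap on returned sentences is an assumption drawn only from the parameter names:

main("TRAF2", "TRADD", articles=20, max_sentences=5)  # illustrative values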
Example #2
import csv
import os

import PPI_cite_main  # project module that drives the indexing pipeline


def main(size_of_test_set, articles, max_sentences):
    path = r'C:\Users\Adam\workspace\Wiki Pi NLP\Test_Set_Files_BIONLP09\dot_a1_files'
    count = 0
    for dir_entry in os.listdir(path):
        count += 1
        if count > size_of_test_set:
            break
        dir_entry_path = os.path.join(path, dir_entry)
        if os.path.isfile(dir_entry_path):
            with open(dir_entry_path, 'r', newline='') as my_file:
                # .a1 files are tab-separated; column 2 holds the entity text.
                reader = csv.reader(my_file, delimiter='\t')
                rows = list(reader)
                first_two_proteins = [row[2] for row in rows[:2]]
                if len(first_two_proteins) != 2:
                    continue
                first_two_proteins = [x.lower() for x in first_two_proteins]
                # first_two_proteins = [x.replace('-', ' ') for x in first_two_proteins]
                if first_two_proteins[0] == first_two_proteins[1]:
                    continue
                a1_file = make_a1_file_object(my_file, dir_entry,
                                              first_two_proteins)

            print(a1_file.proteins, count)
            PPI_cite_main.index(a1_file, articles, max_sentences)
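Example #2 reads BioNLP'09 shared-task .a1 annotation files, which are tab-separated with three fields per line: an annotation ID, a "Type start end" triple, and the annotated text, which is why the loop takes column index 2. A minimal sketch of that parsing step on a hand-written two-line sample (the protein names are illustrative):

import csv
import io

# Two illustrative .a1 lines: ID <TAB> "Type start end" <TAB> entity text.
sample_a1 = "T1\tProtein 0 4\tTRAF\nT2\tProtein 10 13\tp53\n"
rows = list(csv.reader(io.StringIO(sample_a1), delimiter='\t'))
first_two_proteins = [row[2] for row in rows[:2]]
print(first_two_proteins)  # ['TRAF', 'p53']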
Example #3
import time

import PPI_cite_main  # project module that drives the indexing pipeline


def sent_with_cooccur(ID_paper_obj_dict, query):
    """Return a list of (sentence, ID) tuples for every sentence that
    contains both queries."""
    ID_sentence_lists = []
    for key in ID_paper_obj_dict:
        paper = ID_paper_obj_dict[key]
        # Split and tokenize lazily: only papers not yet processed.
        if not paper.all_sentences:
            paper.split_abstract_into_sentences(query)
            paper.word_tokenize()
        cooccurrence_list = paper.find_sentences_with_both_queries(
            paper.all_sentences, query)
        sentence_list = paper.make_sentence_id_tuples(cooccurrence_list)
        ID_sentence_lists.extend(sentence_list)

    if not ID_sentence_lists:
        print("No sentences with co-occurrence found")
        time.sleep(3)
        PPI_cite_main.no_cooc_sent("No sentences with co-occurrence found",
                                   None, None, None, None, None)
    return ID_sentence_lists
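The interface of the paper objects is only implied by the calls above. A minimal sketch with a hypothetical stub class (every name except sent_with_cooccur is illustrative, query is assumed to be the pair of protein names, and the PPI_cite_main dependency is only hit when nothing co-occurs):

class StubPaper:
    # Hypothetical stand-in for the project's paper class.
    def __init__(self, abstract):
        self.abstract = abstract
        self.all_sentences = []

    def split_abstract_into_sentences(self, query):
        self.all_sentences = self.abstract.split('. ')

    def word_tokenize(self):
        pass  # the real class tokenizes each sentence here

    def find_sentences_with_both_queries(self, sentences, query):
        q1, q2 = query
        return [s for s in sentences if q1 in s.lower() and q2 in s.lower()]

    def make_sentence_id_tuples(self, cooccurrence_list):
        return [(s, '12345678') for s in cooccurrence_list]


papers = {'12345678': StubPaper('p53 binds mdm2. An unrelated sentence.')}
print(sent_with_cooccur(papers, ('p53', 'mdm2')))
# [('p53 binds mdm2', '12345678')]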
Example #4
import xml.etree.ElementTree as ET

import PPI_cite_main  # project module that drives the indexing pipeline


def get_ID_list(xml):
    """Parse a PubMed eSearch XML response into a dict of paper ID lists."""
    try:
        root = ET.fromstring(xml)
        # Each <Id> element's text is a PubMed ID.
        ids = [element.text.strip() for element in root.findall("./IdList/Id")]
    except ET.ParseError:  # fromstring raises ParseError, not AttributeError
        ids = []
        print("No Papers with both queries were found on PubMed")
        PPI_cite_main.no_papers_with_queries(
            "No Papers with both queries were found on PubMed",
            None, None, None, None, None)

    existing_papers = []  # Use this in the future to make a database of existing IDs
    papers_to_download = list(ids)

    full_ID_List = {"existing_papers": existing_papers,
                    "papers_to_download": papers_to_download}
    return full_ID_List
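A minimal sketch of calling get_ID_list on a hand-written eSearch-style response (the XML below is illustrative, not a captured PubMed reply; only the eSearchResult/IdList/Id structure matches the real service):

sample_xml = """<eSearchResult>
  <IdList>
    <Id>12345678</Id>
    <Id>23456789</Id>
  </IdList>
</eSearchResult>"""

result = get_ID_list(sample_xml)
print(result["papers_to_download"])  # ['12345678', '23456789']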