def extract_text(output_dir):
    """Export every question listed in the bundled ``cnp_739.tsv`` file.

    The tab-separated file is read from the directory that contains this
    module. Each row becomes one corpus entry keyed by its
    ``question_label``; the entry holds the row's ``question_text`` as the
    text and the label itself as its only label. If two rows share a label,
    the later row replaces the earlier one.

    Args:
        output_dir: Output folder, passed through to ``save_sentences``.
    """
    plugin_directory = os.path.abspath(os.path.dirname(__file__))

    # Build the path with os.path.join rather than "%s/..." formatting so the
    # separator is always the right one for the current platform.
    questions = pandas.read_csv(
        os.path.join(plugin_directory, "cnp_739.tsv"), sep="\t"
    )

    # iterrows() yields (index, row) pairs; unpack them instead of indexing
    # the tuple positionally.
    corpus_input = {
        row.question_label: {
            "text": row.question_text,
            "labels": [row.question_label],
        }
        for _, row in questions.iterrows()
    }

    # Save articles to text files in output folder
    save_sentences(corpus_input, output_dir=output_dir)
def extract_text(uids,output_dir):
    """Template showing the input shapes that can be handed to ``save_sentences``.

    This function will be called by a job, and must call ``save_sentences``.
    ** ALL USER FUNCTIONS MUST HAVE output_dir as an input.

    Three example corpora are built below; only the last one (raw text in a
    list) is actually passed on, the first two are illustrations.

    Args:
        uids: Not used by this template body.
        output_dir: Output folder, passed through to ``save_sentences``.
    """
    # Option 1: a dictionary, if your items have ids
    corpus_with_ids = {
        "uid1": {"text": "One fish, two fish."},
        "uid2": {"text": "Nerd fish, wordfish."},
    }

    # Option 2: the same dictionary with labels (to be used for classification)
    corpus_with_labels = {
        "uid1": {"text": "One fish, two fish.", "labels": ["counting", "poem"]},
        "uid2": {"text": "Nerd fish, wordfish.", "labels": ["poem"]},
    }

    # Option 3: just raw text in a list
    corpus_raw_text = [
        "This is text from article 1 with no id.",
        "This is text from article 2 with no id.",
    ]

    # Save articles to individual text files in output folder
    save_sentences(corpus_raw_text, output_dir=output_dir)
# NOTE(review): the statements down to the save_sentences call are the tail of
# a definition whose header lies outside this excerpt; they are kept unchanged
# and only re-flowed onto separate lines.
features = pandas.read_csv(f,sep="\t")
# Prepare dictionary with key [pmid] and value [text]
features.index = features.pmid
features = features.drop("pmid",axis=1)
corpus_input = dict()
for pmid,article in articles.iteritems():
    # Label the article with nonzero values
    # NOTE(review): the bare except silently skips any article for which the
    # lookup or abstract retrieval fails.
    try:
        labels = features.columns[features.loc[int(pmid)]!=0].tolist()
        corpus_input[pmid] = {"text":article.getAbstract(),"labels":labels}
    except:
        pass
# Save articles to text files in output folder
save_sentences(corpus_input,output_dir=output_dir)


def extract_terms(output_dir):
    """Save the term names found in the header of the features table.

    The features table is loaded as tab-separated data from the first item
    returned by ``download_data``. Its column names, minus the first column
    (pmid), are passed to ``save_terms``.

    Args:
        output_dir: Output folder, passed through to ``save_terms``.
    """
    features_source, _database_source = download_data()
    feature_table = pandas.read_csv(features_source, sep="\t")

    term_names = feature_table.columns.tolist()
    # Drop the leading pmid column; the remaining columns are the terms.
    term_names.pop(0)

    save_terms(term_names, output_dir)


# NOTE(review): this definition continues beyond the end of this excerpt; the
# visible statements are kept unchanged.
def generate_maps(terms,output_dir):
    f,d = download_data()
    features = pandas.read_csv(f,sep="\t")
    database = pandas.read_csv(d,sep="\t")