import re


def find_tagged_NPs(chunked_text):
    """Find tagged phrases in Switchboard.

    Returns one list per bracketed NP chunk, each a list of (word, tag) tuples.
    """
    noun_phrases = re.findall(r"\[.*?\]", chunked_text)
    chunked_nps = []
    for np in noun_phrases:
        new_np = np.replace("[", "").replace("]", "").strip()
        constituents = new_np.split(" ")
        chunked_nps.append([tuple(c.split("/")) for c in constituents])
    return chunked_nps
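# Hedged usage sketch for find_tagged_NPs: the input is assumed to be a
# Switchboard-style chunked string with word/TAG tokens and NP chunks in square
# brackets; the sample string below is invented for illustration.
sample = "[ the/DT dog/NN ] chased/VBD [ a/DT red/JJ ball/NN ]"
for np_chunk in find_tagged_NPs(sample):
    print(np_chunk)
# expected:
#   [('the', 'DT'), ('dog', 'NN')]
#   [('a', 'DT'), ('red', 'JJ'), ('ball', 'NN')]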
def assign_word_spans(self, ent):
    """Map an entity label to a [start, end) word-index span in the aligned description.

    Relies on a module-level sanitize_text() helper and on the parse data
    stored under self.data()['parse']['noun_phrase_chunks'].
    """
    chunk_spans = []
    seen_chunks = []
    token_spans = self.data()['parse']['noun_phrase_chunks']['token_spans']
    doc = self.data()['parse']['noun_phrase_chunks']['aligned_description'].lower()
    noun_phrases_w_spans = self.data()['parse']['noun_phrase_chunks']['named_chunks']
    # for np in noun_phrases_w_spans:
    np = ent.data()['entityLabel'].lower().strip()
    char_spans = [(m.start(), m.end() - 1)
                  for m in re.finditer(np + r'\s|' + np + r'\.', doc)]
    if not char_spans:
        # Normalize punctuation in the entity label until it matches the description.
        np = sanitize_text(np)
        for punct in [',', '-', '/', '.']:
            if punct not in np:
                continue
            if punct not in doc or punct == np[-1]:
                if punct + ' ' in np or punct == np[-1]:
                    np = np.replace(punct, '')
                else:
                    np = np.replace(punct, ' ')
        char_spans = [(m.start(), m.end() - 1)
                      for m in re.finditer(np + r'\s|' + np + r'\.', doc)]
    # Pick the occurrence that has not already been consumed by an earlier chunk.
    occ_n = seen_chunks.count(np)
    start, end = char_spans[occ_n]
    start_w, end_w = None, None
    for w_idx, token_span in enumerate(token_spans):
        token, ts, te = token_span
        if ts == start:
            start_w = w_idx
        if te == end:
            end_w = w_idx + 1
    if isinstance(start_w, int) and isinstance(end_w, int):
        chunk_spans.append([start_w, end_w])
    else:
        # Alignment failed: the character span has no matching token boundaries.
        raise IndexError
    np_pieces = np.split()
    seen_chunks += list(set(np_pieces).union(set([np])))
    return chunk_spans[-1]
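# Minimal, self-contained sketch of the alignment idea used by assign_word_spans:
# character offsets found with re.finditer are mapped onto word indices by
# matching them against (token, char_start, char_end) triples. The toy doc and
# token_spans below are invented for illustration.
import re

doc = "the red ball bounced"
token_spans = [("the", 0, 2), ("red", 4, 6), ("ball", 8, 11), ("bounced", 13, 19)]
np_label = "red ball"

match = re.search(re.escape(np_label), doc)
start, end = match.start(), match.end() - 1  # inclusive character span
start_w = next(i for i, (_, ts, _) in enumerate(token_spans) if ts == start)
end_w = next(i for i, (_, _, te) in enumerate(token_spans) if te == end) + 1
print([start_w, end_w])  # -> [1, 3]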
def parse_np(index):
    """Collect the tokens of the NP that starts at `index` in `tree`,
    a module-level list of whitespace-split bracketed parse tokens."""
    np = ''
    closing = 0
    for elem in tree[index:]:
        if elem[0] == '(':
            closing += 1
        else:
            match = re.findall(r"\)", elem)
            np += elem.replace(')', '').strip() + ' '
            closing -= len(match)
            if closing <= 0:
                break
    return (np.replace('-LRB- ', '(')
              .replace(' -RRB-', ')')
              .replace('-LRB-', '(')
              .replace('-RRB-', ')')
              .strip()
              .lower())
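# Hedged usage sketch for parse_np: it assumes a module-level `tree`, a flat
# whitespace-split list of Penn-Treebank-style bracketed parse tokens. The
# sample parse below is invented for illustration.
tree = "(S (NP (DT the) (NN dog)) (VP (VBD barked)))".split()
print(parse_np(1))  # tokens of the NP starting at index 1 -> 'the dog'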
import os
import subprocess

import numpy as np


def mc_tests(ID, dir_name, outdir="MC_TESTS", bestfit_loc="MODELS/"):
    '''
    ID is the Galaxy name
    dir_name is PropID_GalaxyName
    Will put mc tests in MC_TESTS/*/dirname

    Relies on module-level helpers and data: check_mkdir, get_afile,
    get_fakFILE, read_table, calc_LF, flux_from_mass_loss, mc_dir, jason,
    trgb, mag2, rates, filt1, filt2, object_mass.
    '''
    # Make new folders
    folders = ['PARS/', 'INPUT/', 'OUTPUT/']
    outdir = check_mkdir(outdir)
    dummy = [check_mkdir(os.path.join(outdir, folder)) for folder in folders]
    this_outdir = [check_mkdir(os.path.join(outdir, folder, dir_name))
                   for folder in folders]  # one dir_name subdir per folder

    # A place to write the commands if this needs to be run again
    cmds_out = check_mkdir(os.path.join(outdir, 'MC_COMMANDS/'))
    # Opened for writing: commands are recorded below as each model is run.
    cmds_file = open('%s/%s_MC_commands.dat' % (cmds_out, ID), 'w')

    # Place for the output table
    table_out = check_mkdir(os.path.join(outdir, 'TABLES/'))
    out = open('%s/%s_MC.dat' % (table_out, ID), 'w')
    out.write('# mc_ID p_value NRGB_data NAGB_data NRGB_model NAGB_model '
              'mass_model N_wind Flux1_wind Flux2_wind\n')

    # best fits: bfs[0] = pars for run_trilegal, bfs[1] = input for trilegal,
    # bfs[2] = output for trilegal
    bfs = [get_afile(bestfit_loc + folder, '*' + ID + '*')[0]
           for folder in folders]

    # find model from file!!! This is not general!!
    model = bfs[-1].split('model_')[-1].replace('.dat.dat', '.dat')

    # Switch to the MC test directories
    new_place = [bf.replace(bestfit_loc, 'MC_TESTS/')
                   .replace(folder, folder + dir_name + '/')
                 for bf, folder in zip(bfs, folders)]

    # Best fit pars is the template, we'll swap out the sfh file
    in_pars = open(bfs[0]).readlines()

    # Load SFHs
    sfhs = get_afile(mc_dir + '/', 'SFR*mc*dat')
    for i, sfh in enumerate(sfhs):
        mcid = sfh.split('/')[-1].split('.')[1]  # need to change jason/me
        # 'place' instead of 'np' so numpy is not shadowed.
        new_names = [place.replace(ext, '.' + mcid + ext)
                     for place, ext in zip(new_place, ['.pars', '.dat', '.dat.dat'])]

        # New pars file for run_trilegal.py
        pfile = open(new_names[0], 'w')
        for inpar in in_pars[:-1]:
            pfile.write(inpar)
        # sfh line is at the bottom.
        pfile.write("%-18s %s\n" % ('object_sfr ', sfh))
        pfile.close()

        cmd = "/Users/Phil/research/Italy/WXTRILEGAL/run_trilegal.py "
        # cmd = "/home/philrose/WXTRILEGAL/run_trilegal.py "
        cmd += "-e code_2.0/main "
        cmd += "-a "
        cmd += "-i %s " % new_names[1]
        cmd += "-o %s " % new_names[2]
        cmd += "-f %s " % model
        cmd += new_names[0]
        cmds_file.write('%s \n' % cmd)

        print('running TRILEGAL:', model, ID)
        # Launch once and wait for it to finish (the original launched twice).
        p = subprocess.Popen(cmd, shell=True)
        sts = os.waitpid(p.pid, 0)[1]

        fak_file = get_fakFILE(ID, jason=jason)
        ast_file = (new_names[2].split('/')[0] + '/ast_' +
                    new_names[2].split('/')[-1])
        cmd = "AST/code/spread_angst <<EOF \n"
        cmd += fak_file + "\n"
        cmd += new_names[2] + "\n"
        cmd += ast_file + "\n"
        cmd += "EOF \n"
        print(" ... completeness using %s" % fak_file)
        print(" %s -> %s" % (new_names[2], ast_file))
        print('Running spread_angst...')
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        sts = os.waitpid(p.pid, 0)[1]
        os.system("wc -l %s %s|head -2" % (new_names[2], ast_file))
        cmds_file.write('%s \n' % cmd)

        synthcmd = read_table(ast_file)
        s_mag2 = synthcmd.get_col('mag2') + synthcmd.get_col('diff_mag2')
        s_mag2 = s_mag2[np.nonzero(abs(synthcmd.get_col('diff_mag2')) < 90.)[0]]
        s_mag1 = synthcmd.get_col('mag1') + synthcmd.get_col('diff_mag1')
        s_mag1 = s_mag1[np.nonzero(abs(synthcmd.get_col('diff_mag1')) < 90.)[0]]
        s_color = s_mag1 - s_mag2

        Norm = trgb + 1.5
        (ind, nB_AGB, nNorm, ps_nNorm, ps_nB_AGBm, hist, bins, s_hist_normed,
         p_value, normalization) = calc_LF(mag2, s_mag2, Norm, trgb)
        Nstars, flux_rates = flux_from_mass_loss(synthcmd, rates,
                                                 [filt1, filt2],
                                                 ast_inds=ind, rel_flux=True)
        # Header was already written above; write one row per MC run.
        out.write('%s %.3f %i %i %i %i %e %i %e %e \n' %
                  (mcid, p_value, nNorm, nB_AGB, ps_nNorm, ps_nB_AGBm,
                   object_mass, Nstars[0], flux_rates[0][1], flux_rates[1][0]))

        os.remove(ast_file)
        print('deleted', ast_file)
        os.remove(new_names[2])
        print('deleted', new_names[2])

    out.close()
    cmds_file.close()
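# Hedged usage note for mc_tests: the galaxy ID and PropID_GalaxyName arguments
# below are hypothetical, and the call assumes the module-level helpers and data
# listed in the docstring above are defined, with MODELS/ holding the best-fit
# PARS/INPUT/OUTPUT files and mc_dir holding the SFR*mc*dat star-formation histories.
# mc_tests('SOME_GALAXY', 'SOMEPROP_SOME_GALAXY')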
def doc_retrieval(claim):
    # claim = 'Modi is the president of India'
    # claim = 'Modi is the president of India in 1992'

    def get_NP(tree, nps):
        """Recursively collect the words of every NP node in the parse tree."""
        if isinstance(tree, dict):
            if "children" not in tree:
                if tree['nodeType'] == "NP":
                    nps.append(tree['word'])
            elif "children" in tree:
                if tree['nodeType'] == "NP":
                    nps.append(tree['word'])
                    get_NP(tree['children'], nps)
                else:
                    get_NP(tree['children'], nps)
        elif isinstance(tree, list):
            for sub_tree in tree:
                get_NP(sub_tree, nps)
        return nps

    def get_subjects(tree):
        """Collect the words preceding each VP/S/VBZ child as candidate subjects."""
        subject_words = []
        subjects = []
        for subtree in tree['children']:
            if subtree['nodeType'] in ("VP", "S", "VBZ"):
                subjects.append(' '.join(subject_words))
                subject_words.append(subtree['word'])
            else:
                subject_words.append(subtree['word'])
        return subjects

    tokens = predictor.predict(claim)
    nps = []
    tree = tokens['hierplane_tree']['root']
    noun_phrases = get_NP(tree, nps)
    subjects = get_subjects(tree)
    for subject in subjects:
        if len(subject) > 0:
            noun_phrases.append(subject)

    # Search Wikipedia for each noun phrase; keep more hits when there is only
    # a single phrase to search with.
    predicted_pages = []
    if len(noun_phrases) == 1:
        for np in noun_phrases:
            if len(np) > 300:
                continue
            docs = wikipedia.search(np)
            predicted_pages.extend(docs[:2])  # threshold
    else:
        for np in noun_phrases:
            if len(np) > 300:
                continue
            docs = wikipedia.search(np)
            predicted_pages.extend(docs[:1])
    wiki_results = set(predicted_pages)

    # Convert noun phrases to Wikipedia-style page titles.
    noun_phrases = set(noun_phrases)
    f_predicted_pages = []
    for np in noun_phrases:
        page = np.replace('( ', '-LRB-')
        page = page.replace(' )', '-RRB-')
        page = page.replace(' - ', '-')
        page = page.replace(' -', '-')
        page = page.replace(' :', '-COLON-')
        page = page.replace(' ,', ',')
        page = page.replace(" 's", "'s")
        page = page.replace(' ', '_')
        if len(page) < 1:
            continue
        f_predicted_pages.append(page)

    noun_phrases = list(set(noun_phrases))
    wiki_results = list(set(wiki_results))

    claim = normalize(claim)
    claim = claim.replace(".", "")
    claim = claim.replace("-", " ")
    porter_stemmer = nltk.PorterStemmer()
    tokenizer = nltk.word_tokenize
    words = [porter_stemmer.stem(word.lower()) for word in tokenizer(claim)]
    words = set(words)

    # Keep a retrieved page only if every stemmed word of its cleaned title
    # also appears in the claim.
    for page in wiki_results:
        page = normalize(page)
        processed_page = re.sub("-LRB-.*?-RRB-", "", page)
        processed_page = re.sub("_", " ", processed_page)
        processed_page = re.sub("-COLON-", ":", processed_page)
        processed_page = processed_page.replace("-", " ")
        processed_page = processed_page.replace("–", " ")
        processed_page = processed_page.replace(".", "")
        page_words = [porter_stemmer.stem(word.lower())
                      for word in tokenizer(processed_page) if len(word) > 0]
        if all([item in words for item in page_words]):
            if ':' in page:
                page = page.replace(":", "-COLON-")
            f_predicted_pages.append(normalize(page))

    f_predicted_pages = list(set(f_predicted_pages))
    print(f'Search Entities: {noun_phrases}')
    print(f'Articles Retrieved: {wiki_results}')
    print(f'Predicted Retrievals: {f_predicted_pages}')

    # Pull the article text for each retrieved title and split it into sentences,
    # remembering which page each sentence came from.
    filtered_lines = []
    print('looping...')
    wiki_results_list = []
    for result in wiki_results:
        try:
            p = wiki_wiki.page(result).text
            lines = p.split('\n')
            for line in lines:
                line = line.replace('\\', '')
                if not line.startswith('==') and len(line) > 60:
                    line = nltk.sent_tokenize(line)
                    for l in line:
                        wiki_results_list.append(result)
                    filtered_lines.extend(line)
        except Exception:
            print('error')

    # Alternative sentence splitting with spaCy's English sentencizer, kept for reference:
    # for result in wiki_results:
    #     try:
    #         p = wiki_wiki.page(result).text
    #         nlp = English()
    #         nlp.add_pipe(nlp.create_pipe('sentencizer'))
    #         doc = nlp(p)
    #         filtered_lines = [sent.text for sent in doc.sents]
    #     except Exception:
    #         print('error')

    return filtered_lines, wiki_results_list
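# doc_retrieval relies on several module-level objects that are not defined in
# this snippet. A hedged setup sketch, assuming an AllenNLP constituency parser
# for `predictor`, the `wikipedia` package for search, `wikipediaapi` for page
# text, and a simple Unicode `normalize` helper; the model URL and the helper
# definition are assumptions, not confirmed by the original code.
import re
import unicodedata

import nltk
import wikipedia
import wikipediaapi
from allennlp.predictors.predictor import Predictor

# nltk.download('punkt')  # needed once for word_tokenize / sent_tokenize

predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/"
    "elmo-constituency-parser-2020.02.10.tar.gz")  # assumed model archive
wiki_wiki = wikipediaapi.Wikipedia('en')  # newer wikipedia-api versions also want a user_agent


def normalize(text):
    # assumed helper: plain Unicode normalization, as in FEVER-style pipelines
    return unicodedata.normalize('NFD', text)

# filtered_lines, wiki_results_list = doc_retrieval('Modi is the prime minister of India')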
time_max = 0
"""Chunk info from a previous run:
shape (100, 4), Index([0, 1, 2, 3], dtype='int64')
dtypes: 1 object, 2 int64, 3 object
"""
# batch(data.loc[:, 0].values)
# `data` is assumed to be the chunked reader from this commented-out read_csv:
# data = pd.read_csv(read_url, sep='\s+', delimiter=',', header=None,
#                    engine='python', chunksize=50, dtype=np.str)  # shape (50, 1)
for j, o1 in enumerate(data):
    if j != 0:
        break
    np1 = np.array(o1.loc[:8, 3].values).astype(str)
    # NumPy has no np.replace(); missing values in the string array can be
    # filled with np.where ('' below is a placeholder fill value standing in
    # for the undefined name in the original) or with pandas, as in the
    # commented lines that follow.
    print(np.where(np1 == 'None', '', np1))
    # o1 = pd.Series(o1.loc[:8, 3])
    # print(o1.map({'None': 0}))
    # print(o1.replace([None], 9))
    # """max value of o1 using DataFrame.max([axis, skipna, level, ...])"""
    # batch(o1.loc[:, 0].values)
    # o1_max = o1.max()
    # max_value = o1_max.at[o1_max.index[0]]
    # print(max_value)
    # print(type(o1_max), o1_max.shape, o1_max.index)
    # print(type(o1), o1.index, o1.columns, o1.shape, o1.dtypes)
    # print(type(o1.loc[:, 0]), o1.index, o1.columns, o1.shape, o1.loc[:, 0].dtypes)
    # <class 'pandas.core.series.Series'> RangeIndex(start=0, stop=100, step=1) Int64Index([0, 1, 2, 3], dtype='int64') (100, 4) float64
    # print(type(o1.loc[:, 3]), o1.index, o1.columns, o1.shape, o1.loc[:, 3].dtypes)
    # <class 'pandas.core.series.Series'> RangeIndex(start=0, stop=100, step=1) Int64Index([0, 1, 2, 3], dtype='int64') (100, 4) object
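# A short, self-contained illustration of the replacement the loop above is
# attempting: NumPy arrays have no .replace / np.replace, so missing values are
# handled either on the pandas side before conversion (fillna, or the
# o1.replace([None], ...) call from the comments above) or with np.where on the
# string array. The toy data below is invented.
import numpy as np
import pandas as pd

col = pd.Series([None, '7', None, '3'])
print(col.fillna(0).tolist())            # -> [0, '7', 0, '3']

arr = col.values.astype(str)             # None values become the string 'None'
print(np.where(arr == 'None', '0', arr)) # -> ['0' '7' '0' '3']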