import re


def find_tagged_NPs(chunked_text):
    """Find tagged noun-phrase chunks in Switchboard-style chunked text."""
    noun_phrases = re.findall(r"\[.*?\]", chunked_text)
    chunked_nps = []
    for np in noun_phrases:
        # Strip the chunk brackets, then split each "word/TAG" token into a tuple.
        new_np = np.replace("[", "").replace("]", "").strip()
        constituents = new_np.split(" ")
        chunked_nps.append([tuple(c.split("/")) for c in constituents])
    return chunked_nps
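A quick usage sketch (the sample string is hypothetical, assuming Switchboard chunks are written as bracketed word/TAG tokens):

sample = "okay/UH [the/DT dog/NN] barked/VBD at/IN [a/DT mail/NN carrier/NN]"
print(find_tagged_NPs(sample))
# -> [[('the', 'DT'), ('dog', 'NN')], [('a', 'DT'), ('mail', 'NN'), ('carrier', 'NN')]]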
Example #2
def assign_word_spans(self, ent):
    chunk_spans = []
    seen_chunks = []
    token_spans = self.data()['parse']['noun_phrase_chunks']['token_spans']
    doc = self.data()['parse']['noun_phrase_chunks']['aligned_description'].lower()
    noun_phrases_w_spans = self.data()['parse']['noun_phrase_chunks']['named_chunks']
    # for np in noun_phrases_w_spans:
    np = ent.data()['entityLabel'].lower().strip()
    # Character spans where the entity label is followed by whitespace or a period;
    # re.escape keeps labels containing regex metacharacters from breaking the pattern.
    char_spans = [(m.start(), m.end() - 1)
                  for m in re.finditer(re.escape(np) + r'\s|' + re.escape(np) + r'\.', doc)]
    if not char_spans:
        # Retry with sanitized punctuation when the raw label does not match.
        np = sanitize_text(np)
        for punct in [',', '-', '/', '.']:
            if punct not in np:
                continue
            if punct not in doc or punct == np[-1]:
                if punct + ' ' in np or punct == np[-1]:
                    np = np.replace(punct, '')
                else:
                    np = np.replace(punct, ' ')
        char_spans = [(m.start(), m.end() - 1)
                      for m in re.finditer(re.escape(np) + r'\s|' + re.escape(np) + r'\.', doc)]
        # print(char_spans)
        # print(np, doc)
    occ_n = seen_chunks.count(np)
    start, end = char_spans[occ_n]
    start_w, end_w = None, None
    # token_spans entries are assumed to be (token, start_char, end_char) triples.
    for w_idx, token_span in enumerate(token_spans):
        token, ts, te = token_span
        if ts == start:
            start_w = w_idx
        if te == end:
            end_w = w_idx + 1
    if isinstance(start_w, int) and isinstance(end_w, int):
        chunk_spans.append([start_w, end_w])
    else:
        # print(np)
        # print('failed')
        raise IndexError
    np_pieces = np.split()
    seen_chunks += list(set(np_pieces).union(set([np])))
    return chunk_spans[-1]
Example #3
    def parse_np(index):
        np = ''
        closing = 0
        for elem in tree[index:]:
            if elem[0] == '(':
                closing += 1
            else:
                match = re.findall(r"\)", elem)

                np += elem.replace(')', '').strip() + ' '

                closing -= len(match)
                if closing <= 0:
                    break
        # Restore Penn Treebank bracket tokens (-LRB-/-RRB-) to literal parentheses.
        np = np.replace('-LRB- ', '(').replace(' -RRB-', ')')
        np = np.replace('-LRB-', '(').replace('-RRB-', ')')
        return np.strip().lower()
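A minimal sketch of the assumed behavior: parse_np closes over a token list `tree` (a bracketed constituency parse split on whitespace), so the standalone variant below passes `tree` explicitly; the input tokens are hypothetical.

def demo_parse_np(tree, index):
    # Core loop of parse_np above, with `tree` as a parameter
    # (the -LRB-/-RRB- restoration step is omitted here).
    np = ''
    closing = 0
    for elem in tree[index:]:
        if elem[0] == '(':
            closing += 1
        else:
            np += elem.replace(')', '').strip() + ' '
            closing -= elem.count(')')
            if closing <= 0:
                break
    return np.strip().lower()

# "(NP (DT the) (NN dog))" split on whitespace, starting at the NP opener:
print(demo_parse_np(['(NP', '(DT', 'the)', '(NN', 'dog))'], 0))  # -> 'the dog'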
Example #4
def mc_tests(ID, dir_name, outdir="MC_TESTS", bestfit_loc="MODELS/"):
    '''
    ID is the Galaxy name
    dir_name is PropID_GalaxyName
    Will put mc tests in MC_TESTS/*/dirname
    '''

    # Make new folders
    folders = ['PARS/', 'INPUT/', 'OUTPUT/']
    outdir = check_mkdir(outdir)
    # Create MC_TESTS/<folder>/ and MC_TESTS/<folder>/<dir_name>/ for each folder.
    for folder in folders:
        check_mkdir(os.path.join(outdir, folder))
        check_mkdir(os.path.join(outdir, folder, dir_name))

    # A place to write the commands if this needs to be run again
    cmds_out = check_mkdir(os.path.join(outdir, 'MC_COMMANDS/'))
    cmds_file = open('%s/%s_MC_commands.dat' % (cmds_out, ID), 'w')

    # Place for the output table
    table_out = check_mkdir(os.path.join(outdir, 'TABLES/'))
    out = open('%s/%s_MC.dat' % (table_out, ID), 'w')
    out.write(
        '# mc_ID p_value NRGB_data NAGB_data NRGB_model NAGB_model mass_model N_wind Flux1_wind Flux2_wind\n'
    )

    # best fits: bfs[0] = pars for run_trilegal, bfs[1] = input for trilegal, bfs[2] = output for trilegal
    bfs = [
        get_afile(bestfit_loc + folder, '*' + ID + '*')[0]
        for folder in folders
    ]

    # find model from file!!! This is not general! !
    model = bfs[-1].split('model_')[-1].replace('.dat.dat', '.dat')

    # Switch to the MC test directories
    new_place = [
        bf.replace(bestfit_loc, 'MC_TESTS/').replace(fold, fold + dir_name + '/')
        for bf, fold in zip(bfs, folders)
    ]

    # Best fit pars is the template, we'll swap out sfh file
    in_pars = open(bfs[0]).readlines()

    # Load SFHs
    sfhs = get_afile(mc_dir + '/', 'SFR*mc*dat')

    for i, sfh in enumerate(sfhs):
        mcid = sfh.split('/')[-1].split('.')[1]  # need to change jason/me
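        # Note: `np` in the comprehension below is a path string, not numpy; under
        # Python 2 the comprehension target leaks and shadows the numpy alias used
        # later in this loop (np.nonzero).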
        new_names = [
            np.replace(ext, '.' + mcid + ext)
            for np, ext in zip(new_place, ['.pars', '.dat', '.dat.dat'])
        ]
        # New pars file for run_trilegal.py; the sfh line is at the bottom, so drop it.
        pfile = open(new_names[0], 'w')
        pfile.writelines(in_pars[:-1])
        pfile.write("%-18s %s\n" % ('object_sfr  ', sfh))
        pfile.close()

        cmd = "/Users/Phil/research/Italy/WXTRILEGAL/run_trilegal.py "
        #cmd="/home/philrose/WXTRILEGAL/run_trilegal.py "
        cmd += "-e code_2.0/main "
        cmd += "-a "
        cmd += "-i %s " % new_names[1]
        cmd += "-o %s " % new_names[2]
        cmd += "-f %s " % model
        cmd += new_names[0]

        cmds_file.write('%s \n' % cmd)
        print 'running TRILEGAL:', model, ID
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        sts = os.waitpid(p.pid, 0)[1]

        fak_file = get_fakFILE(ID, jason=jason)
        ast_file = new_names[2].split('/')[0] + '/ast_' + new_names[2].split(
            '/')[-1]
        cmd = "AST/code/spread_angst <<EOF \n"
        cmd += fak_file + "\n"
        cmd += new_names[2] + "\n"
        cmd += ast_file + "\n"
        cmd += "EOF \n"
        print "  ... completeness using %s" % fak_file
        print "  %s -> %s" % (new_names[2], ast_file)
        print 'Running spread_angst...'
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        sts = os.waitpid(p.pid, 0)[1]
        os.system("wc -l %s %s|head -2" % (new_names[2], ast_file))

        cmds_file.write('%s \n' % cmd)

        synthcmd = read_table(ast_file)
        s_mag2 = synthcmd.get_col('mag2') + synthcmd.get_col('diff_mag2')
        s_mag2 = s_mag2[np.nonzero(abs(synthcmd.get_col('diff_mag2')) < 90.)[0]]
        s_mag1 = synthcmd.get_col('mag1') + synthcmd.get_col('diff_mag1')
        s_mag1 = s_mag1[np.nonzero(abs(synthcmd.get_col('diff_mag1')) < 90.)[0]]
        s_color = s_mag1 - s_mag2
        Norm = trgb + 1.5
        ind, nB_AGB, nNorm, ps_nNorm, ps_nB_AGBm, hist, bins, s_hist_normed, p_value, normalization = calc_LF(
            mag2, s_mag2, Norm, trgb)
        Nstars, flux_rates = flux_from_mass_loss(synthcmd,
                                                 rates, [filt1, filt2],
                                                 ast_inds=ind,
                                                 rel_flux=True)
        out.write('%s %.3f %i %i %i %i %e %i %e %e \n' %
                  (mcid, p_value, nNorm, nB_AGB, ps_nNorm, ps_nB_AGBm,
                   object_mass, Nstars[0], flux_rates[0][1], flux_rates[1][0]))

        os.remove(ast_file)
        print 'deleted', ast_file
        os.remove(new_names[2])
        print 'deleted', new_names[2]

    out.close()
    cmds_file.close()
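check_mkdir above is a helper that is not shown in this listing; a minimal stand-in consistent with how it is used (create the directory if it is missing and return the path) might look like this sketch:

import os

def check_mkdir(path):
    # Create `path` (and any parents) if it does not exist yet, then return it.
    if not os.path.isdir(path):
        os.makedirs(path)
    return path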
Example #5
def doc_retrieval(claim):
    # claim = 'Modi is the president of India'

    # claim = 'Modi is the president of India in 1992'

    def get_NP(tree, nps):

        if isinstance(tree, dict):
            if "children" not in tree:
                if tree['nodeType'] == "NP":
                    # print(tree['word'])
                    # print(tree)
                    nps.append(tree['word'])
            elif "children" in tree:
                if tree['nodeType'] == "NP":
                    # print(tree['word'])
                    nps.append(tree['word'])
                    get_NP(tree['children'], nps)
                else:
                    get_NP(tree['children'], nps)
        elif isinstance(tree, list):
            for sub_tree in tree:
                get_NP(sub_tree, nps)

        return nps

    def get_subjects(tree):
        subject_words = []
        subjects = []
        for subtree in tree['children']:
            if subtree['nodeType'] in ("VP", "S", "VBZ"):
                subjects.append(' '.join(subject_words))
                subject_words.append(subtree['word'])
            else:
                subject_words.append(subtree['word'])
        return subjects

    # predictor.predict(claim)
    tokens = predictor.predict(claim)
    nps = []
    tree = tokens['hierplane_tree']['root']
    # print(tree)
    noun_phrases = get_NP(tree, nps)

    subjects = get_subjects(tree)
    for subject in subjects:
        if len(subject) > 0:
            noun_phrases.append(subject)
    # noun_phrases = list(set(noun_phrases))

    predicted_pages = []
    if len(noun_phrases) == 1:
        for np in noun_phrases:
            if len(np) > 300:
                continue
            docs = wikipedia.search(np)

            predicted_pages.extend(docs[:2])  # threshold

    else:
        for np in noun_phrases:
            if len(np) > 300:
                continue
            docs = wikipedia.search(np)

            predicted_pages.extend(docs[:1])

    wiki_results = set(predicted_pages)

    # wiki_results = []
    # for page in predicted_pages:
    #     page = page.replace(" ", "_")
    #     page = page.replace("(", "-LRB-")
    #     page = page.replace(")", "-RRB-")
    #     page = page.replace(":", "-COLON-")
    #     wiki_results.append(page)
    # print(wiki_results)

    noun_phrases = set(noun_phrases)
    f_predicted_pages = []
    for np in noun_phrases:
        page = np.replace('( ', '-LRB-')
        page = page.replace(' )', '-RRB-')
        page = page.replace(' - ', '-')
        page = page.replace(' -', '-')
        page = page.replace(' :', '-COLON-')
        page = page.replace(' ,', ',')
        page = page.replace(" 's", "'s")
        page = page.replace(' ', '_')

        if len(page) < 1:
            continue
        f_predicted_pages.append(page)

    noun_phrases = list(set(noun_phrases))

    wiki_results = list(set(wiki_results))

    # stop_words = set(stopwords.words('english'))
    # wiki_results = [w for w in wiki_results if not w in stop_words]

    claim = normalize(claim)
    claim = claim.replace(".", "")
    claim = claim.replace("-", " ")
    porter_stemmer = nltk.PorterStemmer()
    tokenizer = nltk.word_tokenize
    words = [porter_stemmer.stem(word.lower()) for word in tokenizer(claim)]
    words = set(words)

    for page in wiki_results:
        page = normalize(page)
        processed_page = re.sub("-LRB-.*?-RRB-", "", page)
        processed_page = re.sub("_", " ", processed_page)
        processed_page = re.sub("-COLON-", ":", processed_page)
        processed_page = processed_page.replace("-", " ")
        processed_page = processed_page.replace("–", " ")
        processed_page = processed_page.replace(".", "")
        page_words = [porter_stemmer.stem(word.lower())
                      for word in tokenizer(processed_page) if len(word) > 0]

        if all([item in words for item in page_words]):
            if ':' in page:
                page = page.replace(":", "-COLON-")
            f_predicted_pages.append(normalize(page))
    f_predicted_pages = list(set(f_predicted_pages))

    print(f'Search Entities: {noun_phrases}')
    print(f'Articles Retrieved: {wiki_results}')
    print(f'Predicted Retrievals: {f_predicted_pages}')

    filtered_lines = []
    print('looping...')

    wiki_results_list = []
    for result in wiki_results:
        try:
            p = wiki_wiki.page(result).text
            lines = p.split('\n')
            for line in lines:
                line = line.replace('\\', '')
                if not line.startswith('==') and len(line) > 60:
                    sents = nltk.sent_tokenize(line)
                    # Record the page title once per sentence so the two lists stay aligned.
                    for l in sents:
                        wiki_results_list.append(result)
                    filtered_lines.extend(sents)
        except Exception:
            print('error')


    # for result in wiki_results:
    #     try:
    #         p = wiki_wiki.page(result).text
    #         # p = p.replace('\n', ' ')
    #         # p = p.replace('\t', ' ')
    #         # filtered_lines = nltk.sent_tokenize(p)
    #         # filtered_lines = [line for line in filtered_lines if not line.startswith('==') and len(line) > 10 ]
    #
    #         # Load English tokenizer, tagger, parser, NER and word vectors
    #         nlp = English()
    #         # Create the pipeline 'sentencizer' component
    #         sbd = nlp.create_pipe('sentencizer')
    #         # Add the component to the pipeline
    #         nlp.add_pipe(sbd)
    #         text = p
    #         #  "nlp" Object is used to create documents with linguistic annotations.
    #         doc = nlp(text)
    #         # create list of sentence tokens
    #         filtered_lines = []
    #         for sent in doc.sents:
    #             txt = sent.text
    #             # txt = txt.replace('\n', '')
    #             # txt = txt.replace('\t', '')
    #             filtered_lines.append(txt)
    #
    #     #     lines = p.split('\n')
    #     #     for line in lines:
    #     #         line.replace('\\', '')
    #     #         if not line.startswith('==') and len(line) > 60:
    #     #             line = nltk.sent_tokenize(line)
    #     #             filtered_lines.extend(line)
    #     except:
    #         print('error')

    return filtered_lines, wiki_results_list
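A small sketch of the tree structure get_NP and get_subjects expect: the field names follow the hierplane output used above, but the tree below is hand-built and purely illustrative.

toy_tree = {
    'nodeType': 'S', 'word': 'Modi is the president of India',
    'children': [
        {'nodeType': 'NP', 'word': 'Modi'},
        {'nodeType': 'VP', 'word': 'is the president of India',
         'children': [
             {'nodeType': 'VBZ', 'word': 'is'},
             {'nodeType': 'NP', 'word': 'the president of India'},
         ]},
    ],
}
# Inside doc_retrieval, get_NP(toy_tree, []) would collect
# ['Modi', 'the president of India'] and get_subjects(toy_tree) would return ['Modi'].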
        time_max = 0
        """shape (100, 4),Index([0, 1, 2, 3], dtype='int64')
        dtypes: 1     object
                2      int64
                3     object
                dtype: object
        """
        # batch(data.loc[:, 0].values)

        # data = pd.read_csv(read_url,sep='\s+',delimiter=',', header=None, engine='python', chunksize=50,dtype=np.str)# shape(50, 1)
        for j, o1 in enumerate(data):
            if j != 0:
                break
            np1 = np.array(o1.loc[:8, 3].values).astype(np.str)
            print(np.replace(np1, None, fffxxxxx))

            # o1 = pd.Series(o1.loc[:8,3])
            # # print(o1.map({'None':0}))
            # print(o1.replace([None],9))
        #     """max vule at o1 used func:DataFrame.max([axis, skipna, level, …])"""
        #     batch(o1.loc[:,0].values)
        #     o1_max = o1.max()
        #     max_value = o1_max.at[o1_max.index[0]]
        #     print(max_value)
        #     print(type(o1_max),o1_max.shape,o1_max.index)
        #     # print(type(o1),o1.index,o1.columns,o1.shape,o1.dtypes)
        #     # print(type(o1.loc[:,0]),o1.index,o1.columns,o1.shape,o1.loc[:,0].dtypes)
        #     # <class 'pandas.core.series.Series'> RangeIndex(start=0, stop=100, step=1) Int64Index([0, 1, 2, 3], dtype='int64') (100, 4) float64
        #     # print(type(o1.loc[:,3]),o1.index,o1.columns,o1.shape,o1.loc[:,3].dtypes)
        #     #<class 'pandas.core.series.Series'> RangeIndex(start=0, stop=100, step=1) Int64Index([0, 1, 2, 3], dtype='int64') (100, 4) object