def compare_all(fin1, fdin2):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory
    result = []
    for text in texts:
        result_items = ast.literal_eval(text[2])
        # keep only items whose first element appears in the EDB word list
        result_items_new = [item for item in result_items if item[0] in EDBlist]
        result.append((text[0], text[1], str(result_items_new)))
       
    # output result
    fout = os.path.splitext(fin1)[0] + "_EDB.csv"
    ufile.write_csv(fout, result)
    print('saved result into: %s' % fout)

    print(ext_print('all tasks completed\n'))
    return True
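A minimal usage sketch for compare_all above; the file names and the behavior of the ufile helpers are assumptions for illustration, not part of the original example:

# hypothetical call: keep only parsed items whose first element appears in
# the EDB word list, writing results_EDB.csv next to the input file
compare_all('results.csv', 'edb_words.txt')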
Example #2
def Extract_nonGT(fdin, fout, fin_, fout_, c):

    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False

    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            ))
            return False
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"

        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        all_texts_ = ufile.load_files(fin_)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            ))
            return False
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = []
        output_ = []
        i = 0
        cnt = 0
        goadList = {}
        for t in all_texts_:
            goadList[t[0]] = 1

        for texts in all_texts:
            if i % 1000 == 0:
                print(ext_print('processing %d' % i))
            i += 1

            cop = texts
            inclusive = texts[5].lower()
            inclusive = inclusive[0:inclusive.find('exclusi')]
            combine_texts = (texts[2].lower() + ". " + texts[3].lower() + ". "
                             + texts[4].lower() + ". " + inclusive)
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList
                if not FindSame:
                    output_.append(
                        (cop[0], cop[1], cop[2], cop[3], cop[4], cop[5]))
                    cnt += 1
            if cnt == c:
                break

            if len(result) == 0 or (len(texts[1]) > 0 and len(result) == 1
                                    and pre_label in result):
                continue
            else:
                t = str(texts[0]).replace('"', '')
                output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"

        ufile.write_csv(fout, output)
        ufile.write_csv(fout_, output_)

    print(ext_print('saved processed results into: %s' % fout))

    print(ext_print('all tasks completed\n'))
    return True
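A hedged usage sketch for Extract_nonGT; every file name below is hypothetical, and the function depends on the external ufile helpers and the GAXer_Ggender classifier used throughout these examples:

# hypothetical call: label trials.csv, skip trial IDs already present in
# gold.csv, and collect up to 50 non-gold, non-Transgender rows
Extract_nonGT('trials.csv', 'trials_gender.csv', 'gold.csv', 'nongold_sample.csv', 50)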
Example #3
def GAXer_wrapper(fdin, fout=None):

    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False

    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            ))
            return False
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"

        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print(ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            ))
            return False
        print(ext_print('found a total of %d data items' % len(all_texts)))

        output = []
        i = 0
        for texts in all_texts:
            if i % 1000 == 0:
                print(ext_print('processing %d' % i))
            i += 1

            inclusive = texts[5].lower()
            inclusive = inclusive[0:inclusive.find('exclusi')]
            combine_texts = (texts[3].lower() + ". " + texts[4].lower()
                             + ". " + inclusive)
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            if len(result) == 0:
                continue
            else:
                t = str(texts[0]).replace('"', '')
                output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"

        ufile.write_csv(fout, output)

    print(ext_print('saved processed results into: %s' % fout))

    print(ext_print('all tasks completed\n'))
    return True
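A minimal sketch of driving GAXer_wrapper with both supported input formats; the paths are illustrative only:

# hypothetical calls: a .txt input yields <name>_gender.txt by default,
# a .csv input of trial records yields <name>_gender.csv
GAXer_wrapper('descriptions.txt')
GAXer_wrapper('trials.csv', 'trials_labeled.csv')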
Example #4
def compare_all(fin1, fdin2, fdin3):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory
    if fdin3 is None or fdin3 == "":
        return False
    FreCorpus = ufile.read_file_dict_tokenized(fdin3, '\t')
    
    result = []
    words_sims = {}
    cur = 0
    for text in texts:
        cur += 1
        if len(text[2].split('.')) > 1:
            target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
        else:
            target_word, pos = text[2], None
        print "%d of %d" % (cur, len(texts)), target_word
        simi_values = []
        if target_word not in words_sims:
            processed = []
            processed.append(target_word)
            # step 1: lemmas of the first synset
            can_words = []
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                for l in syn[0].lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0
                    if word_each in FreCorpus:
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)  # sort by corpus frequency
            simi_values.extend(word_fre)
            # step 2: lemmas of the first synset's hypernyms (verbs, nouns, adjectives)
            can_words = []
            syn = wordnet.synsets(target_word)
            if len(syn) > 0:
                syn_word = syn[0].hypernyms()
                for l in syn_word:
                    if l.pos() in ['v', 'n', 'a']:
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0
                    if word_each in FreCorpus:
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)  # sort by corpus frequency
            simi_values.extend(word_fre)
            # step 3: lemmas across all synsets
            can_words = []
            for syn in wordnet.synsets(target_word):
                for l in syn.lemmas():
                    if l.name() not in can_words:
                        can_words.append(l.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0
                    if word_each in FreCorpus:
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)  # sort by corpus frequency
            simi_values.extend(word_fre)
            # step 4: hypernym lemmas across all synsets
            can_words = []
            for syn in wordnet.synsets(target_word):
                syn_word = syn.hypernyms()
                for l in syn_word:
                    if l.pos() in ['v', 'n', 'a']:
                        for k in l.lemmas():
                            if k.name() not in can_words:
                                can_words.append(k.name())
            word_fre = {}
            for word_each in can_words:
                if word_each in EDBlist and word_each not in processed:
                    word_each_fre = 0
                    if word_each in FreCorpus:
                        word_each_fre = int(FreCorpus[word_each])
                    word_fre[word_each] = word_each_fre
                    processed.append(word_each)
            word_fre = sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)  # sort by corpus frequency
            simi_values.extend(word_fre)
            #=================================
            words_sims[target_word] = simi_values
            print(simi_values[:2])
        else:
            simi_values = words_sims[target_word]
        result.append((text[0], text[2], simi_values))
       
    # output result
    fout = os.path.splitext(fin1)[0] + "_4steps.csv"
    ufile.write_csv(fout, result)
    print('saved result into: %s' % fout)

    print(ext_print('all tasks completed\n'))
    return True
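The four steps above repeat the same "filter candidates against EDBlist, then rank by corpus frequency" block. A hedged refactor sketch, assuming the same EDBlist, FreCorpus, and processed structures as in the original (the helper name rank_by_frequency is invented here):

def rank_by_frequency(can_words, EDBlist, FreCorpus, processed):
    # keep candidates that appear in the EDB word list and have not been
    # seen yet, then sort them by corpus frequency, highest first
    word_fre = {}
    for word_each in can_words:
        if word_each in EDBlist and word_each not in processed:
            word_fre[word_each] = int(FreCorpus.get(word_each, 0))
            processed.append(word_each)
    return sorted(word_fre.items(), key=operator.itemgetter(1), reverse=True)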
Example #5
def compare_all(fin1, fdin2, method, thresholds):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_csv(fin1)  # a specific file or a directory
    # read input data
    if fdin2 is None or fdin2 == "":
        return False
    EDBlist = ufile.load_files(fdin2)  # a specific file or a directory

    thresholds = thresholds.split(';')

    # filter out the target word itself and words sharing the same stem or lemma
    porter_stemmer = PorterStemmer()
    gold, fre = [], []
    for threshold in thresholds:
        result = []
        words_sims = {}
        start_time = time.time()
        cur = 0
        for text in texts:
            cur += 1
            for i in range(len(text[3].split(";"))):
                fre.append(text[3].split(";")[i].split(":")[1])
                gold.append(text[3].split(";")[i].split(":")[0])
            if len(text[2].split('.')) > 1:
                target_word, pos = text[2].split('.')[0], text[2].split('.')[1]
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word, pos=pos)
                print(lemma_tw)
            else:
                target_word, pos = text[2], None
                stemming_tw = porter_stemmer.stem(target_word)
                lemma_tw = wordnet.morphy(target_word)

            print("%d of %d" % (cur, len(texts)), target_word)
            simi_values = []

            if target_word not in words_sims:
                word_sim = {}
                for word2 in EDBlist:
                    stemming_cw = porter_stemmer.stem(word2)
                    lemma_word = wordnet.morphy(word2)
                    if word2 not in word_sim:
                        if target_word != word2 and stemming_cw != stemming_tw and lemma_word != lemma_tw:
                            simi_value = compare_allsynsets(
                                method, target_word, word2)
                            if simi_value > float(threshold):
                                word_sim[word2] = round(float(simi_value), 3)
                simi_values = sorted(word_sim.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)  # sort by rank value
                words_sims[target_word] = simi_values
            else:
                simi_values = words_sims[target_word]
            result.append((text[0], text[2], simi_values))
        print("--- %s seconds ---" % (time.time() - start_time))
        # output result
        fout = os.path.splitext(fin1)[0] + "_%s_%s.csv" % (method, threshold)
        ufile.write_csv(fout, result)
        print('saved result into: %s' % fout)

    print(ext_print('all tasks completed\n'))
    return True
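A usage sketch for this variant; the method name and threshold string are illustrative values, and compare_allsynsets is assumed to accept whatever similarity method names the surrounding project defines:

# hypothetical call: run WordNet-based similarity with two cutoffs,
# producing <input>_path_0.2.csv and <input>_path_0.5.csv
compare_all('targets.csv', 'edb_words.txt', 'path', '0.2;0.5')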