Code Example #1
File: TEXer_English.py  Project: techwitz/TEER
import os

# Python 2 source (print-statement syntax throughout). Assumes the other
# module-level names of TEXer_English.py are in scope: ufile (file I/O
# helpers), ext_print (log formatter), temporal_training, temporal_testing,
# and using_TimeX3.

def temporal_processing(fin,
                        fout=None,
                        type="testing",
                        fin_t=None,
                        rep_enable=False,
                        rep_word="",
                        event=False,
                        X3=False):

    # read the input data
    if fin is None:
        print ext_print('no input file found --- interrupting')
        return
    texts = ufile.read_file(fin, 1, False)
    if texts is None or len(texts) <= 0:
        print ext_print('no text available for processing --- interrupting')
        return

    print ext_print('start to process temporal information in text file %s' %
                    fin)

    if type == "training":
        tpatts = temporal_training(texts)

        # output pattern result
        if fout is None or fout == "":
            fout = os.path.splitext(fin)[0] + "_pat" + os.path.splitext(fin)[1]

        ufile.write_file(fout, sorted(tpatts, key=tpatts.get, reverse=True),
                         False)
        print ext_print('saved trained patterns into: %s' % fout)

    elif type == "testing":
        # read the pattern data
        if fin_t is None:
            print ext_print('no pattern file found --- interrupting')
            return
        tpatts = ufile.read_file(fin_t, 1, False)
        if tpatts is None or len(tpatts) <= 0:
            print ext_print(
                'no patterns available for processing --- interrupting')
            return

        result = temporal_testing(texts, tpatts, rep_enable, rep_word, event)
        if X3:
            result = using_TimeX3(result)

        # output result
        if fout is None or fout == "":
            if X3:
                fout = os.path.splitext(fin)[0] + "_TEXer.xml"
            else:
                fout = (os.path.splitext(fin)[0] + "_TEXer" +
                        os.path.splitext(fin)[1])

        ufile.write_file(fout, result, False)
        print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
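A minimal call sketch (file names here are hypothetical; the "_pat" pattern-file name follows the convention the training branch itself uses):

# Train on one file, then tag another with the mined patterns.
temporal_processing("train_notes.txt", type="training")      # writes train_notes_pat.txt
temporal_processing("test_notes.txt",
                    type="testing",
                    fin_t="train_notes_pat.txt",
                    rep_enable=True, rep_word="[TIME]",      # replace matches with a placeholder token
                    X3=True)                                 # post-process via using_TimeX3, save as .xml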
Code Example #2
# Assumes import os plus the project's ufile / ext_print helpers, as in Example #1.
def file_merge(fdin, fout, columns, format):
    # read input data
    if fdin is None or fdin == "":
        return False
    texts = ufile.read(fdin)  # a specific file or a directory
    result = []
    # print texts  # debug: dump raw input
    if columns == "all":
        result = texts
    else:
        cols = columns.split('|')
        for text in texts:
            if len(cols) == 1:
                result.append(text[int(cols[0]) - 1])  # single 1-based column
            else:
                for col in cols:
                    result.append(text[int(col) - 1])

    print ext_print('get %d in total' % len(result))

    # get output data directory
    if fout is None:
        fout = os.path.splitext(fdin)[0] + "_merged" + format
    # output detailed result into file
    if format == "" or ".txt":
        ufile.write_file(fout, result, False)
    elif format == ".csv":
        ufile.write_csv(fout, result)
    print ext_print('saved result into: %s' % fout)
    print ext_print('all tasks completed\n')
    return True
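A quick usage sketch (the file name and column spec are hypothetical; as the parsing above shows, columns are 1-based and separated by '|'):

file_merge("records.txt", None, "2|4", ".csv")   # keep columns 2 and 4 -> records_merged.csv
file_merge("records.txt", None, "all", ".txt")   # keep every column   -> records_merged.txt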
Code Example #3
# Assumes import os and import random, plus the project's ufile / ext_print
# helpers and the GAXer_Ggender classifier.
def Extract_nonGT(fdin, fout, fin_, fout_, c):

    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False

    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"

        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        all_texts_ = ufile.load_files(fin_)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        output_ = []
        i = 0
        cnt = 0
        cho = 0
        j = 100
        jump = int(j * random.random()) + 2
        goadList = {}  # IDs already covered by the ground-truth file
        for t in all_texts_:
            goadList[t[0]] = 1

        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)
            i += 1

            #             if str(texts[0])<>'NCT00002967':
            #                 continue
            cop = texts
            inclusive = texts[5].lower()
            cut = inclusive.find('exclusi')
            if cut >= 0:  # keep only the inclusion criteria when an exclusion section exists
                inclusive = inclusive[:cut]
            combine_texts = (texts[2].lower() + ". " + texts[3].lower() +
                             ". " + texts[4].lower() + ". " + inclusive)
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            '''
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList.keys()
                if not FindSame:
                    if cho==jump:
                        output_.append((cop[0],cop[1],cop[2],cop[3],cop[4],cop[5]))
                        cnt+=1
                        jump=int(j*random.random())+2
                        cho=0
                    cho+=1
            '''
            if 'Transgender' not in str(result):
                FindSame = texts[0] in goadList
                if not FindSame:
                    output_.append(
                        (cop[0], cop[1], cop[2], cop[3], cop[4], cop[5]))
                    cnt += 1
            if cnt == c:
                break

            if len(result) == 0 or (len(texts[1]) > 0 and len(result) == 1
                                    and pre_label in result):
                continue
            else:
                t = texts[0]
                t = t.replace('"', '')
                t = str(t)
                output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"

        ufile.write_csv(fout, output)
        ufile.write_csv(fout_, output_)

    print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
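A hedged example call (file names and the cap are hypothetical): fin_ supplies the IDs already covered by ground truth, fout_ receives up to c sampled non-ground-truth rows, and fout collects the disagreement records.

Extract_nonGT("trials.csv", None, "ground_truth.csv", "non_gt_sample.csv", 500)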
Code Example #4
# Assumes import os plus the project's ufile / ext_print helpers and the
# GAXer_Ggender classifier, as in Example #3.
def GAXer_wrapper(fdin, fout=None):

    #----------------------------------initialize and load supporting data
    # read input data
    all_texts = []
    if fdin is None or fdin == "":
        return False

    elif fdin.endswith(".txt"):
        all_texts = ufile.load_files(fdin)
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        for text in all_texts:
            text = text.lower()
            result = GAXer_Ggender(text)
            output.append(result)

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.txt"

        ufile.write_file(fout, output, False)

    elif fdin.endswith(".csv"):
        all_texts = ufile.load_files(fdin)  # a specific file or a directory
        if all_texts is None or len(all_texts) <= 0:
            print ext_print(
                'input data error, please check either no such file or no data --- interrupting'
            )
            return
        print ext_print('found a total of %d data items' % len(all_texts))

        output = []
        i = 0
        for texts in all_texts:
            if i % 1000 == 0:
                print ext_print('processing %d' % i)
            i += 1

            #             if str(texts[0])<>'NCT00002967':
            #                 continue
            inclusive = texts[5].lower()
            cut = inclusive.find('exclusi')
            if cut >= 0:  # keep only the inclusion criteria when an exclusion section exists
                inclusive = inclusive[:cut]
            #            combine_texts = texts[2].lower() + ". " + texts[3].lower() + ". " + texts[4].lower() + ". " + inclusive
            combine_texts = (texts[3].lower() + ". " + texts[4].lower() +
                             ". " + inclusive)
            pre_label = 'Biological ' + texts[1][0].upper() + texts[1][1:]
            result = GAXer_Ggender(combine_texts, pre_label)
            #            print result
            #            if len(result)==0 or (len(texts[1])>0 and len(result)==1 and pre_label in result):
            if len(result) == 0:
                continue
            else:
                t = texts[0]
                t = t.replace('"', '')
                t = str(t)
                output.append((t, texts[1], str(result)))

        # output result
        if (fout is None) or (fout == ""):
            fout = os.path.splitext(fdin)[0] + "_gender.csv"

        ufile.write_csv(fout, output)

    print ext_print('saved processed results into: %s' % fout)

    print ext_print('all tasks completed\n')
    return True
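Example invocations (paths are hypothetical): the .txt branch labels free text line by line, while the .csv branch expects trial-registry-style rows with the eligibility text in column 6.

GAXer_wrapper("abstracts.txt")             # -> abstracts_gender.txt, one result per line
GAXer_wrapper("trials.csv", "labels.csv")  # -> labels.csv rows of (id, recorded gender, detected labels)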
Code Example #5
# Assumes import os plus the project's ufile helpers, the NLP_sent / NLP_word
# tokenizers, and the word_checking_stop / word_checking_speical filters.
def compare_all(fin1, fdin2, fout1=None):
    # read input data
    if fin1 is None or fin1 == "":
        return False
    texts = ufile.read_file_tokenized(fin1, '\t')  # a specific file or a directory

    word_list = []
    for text in texts:
        sentence = text[0]  # get all sentences
        target_word = text[1]
        # get compact context window
        can_phrases = NLP_sent.phrase_splitting(sentence)
        words = []
        for phrase in can_phrases:
            all_words = NLP_word.word_splitting(phrase.lower())
            if target_word in all_words:
                all_words.remove(target_word)
                for word in all_words:
                    if word_checking_stop(word) == 0:
                        if word not in word_list:
                            word_list.append(word)
                break

    # get output data directory
    if fout1 is None:
        fout1 = os.path.splitext(fin1)[0] + "_wordFeatures.csv"
    ufile.write_csv(fout1, word_list)
    print 'saved result into: %s' % fout1

    # read 1T corpus data
    if fdin2 is None or fdin2 == "":
        return False
    # judge a single file or a directory
    for root, dirs, files in os.walk(fdin2):
        for filename in files:
            f = os.path.join(root, filename)
            print f
            New1T = []
            cur = 0
            fid = open(f, 'r')
            for line in fid:
                cur += 1
                if cur % 1000000 == 0:
                    print filename, cur
                line = line.strip().lower()
                if len(line) > 0:
                    tem = line.split('\t')
                    tem1 = tem[0].split(' ')
                    for tem_word in tem1:
                        if word_checking_speical(tem_word) > 0:
                            break
                        if tem_word in word_list:
                            New1T.append(line)
                            break
            fid.close()
            # get output data directory
            fout2 = fdin2 + "_" + filename
            ufile.write_file(fout2, New1T, False)
            print 'saved result into: %s' % fout2

    print 'all tasks completed\n'
    return True
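A sketch of how this might be driven (paths are hypothetical): fin1 holds tab-separated sentence/target-word pairs, and fdin2 is a directory of 1T-corpus n-gram files to filter against the extracted word features.

compare_all("targets.txt", "web1T")   # writes targets_wordFeatures.csv plus one filtered file per corpus file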