Example #1
0
    ## Pass the ML classifier through to the parse SUTime entities method.

    ## Loop through each file and parse
    for f in range(0, len(infiles)):
        print("Parsing " + infiles[f] + " ...")
        ## Init the ChronoEntity list
        my_chronoentities = []
        my_chrono_ID_counter = 1

        ## parse out the doctime
        doctime = utils.getDocTime(infiles[f] + ".dct")
        if (debug): print(doctime)

        ## parse out reference tokens
        text, tokens, spans, tags, sents = utils.getWhitespaceTokens(
            infiles[f] + args.x)
        #my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, remove_stopwords="./Chrono/stopwords_short2.txt")
        my_refToks = referenceToken.convertToRefTokens(tok_list=tokens,
                                                       span=spans,
                                                       pos=tags,
                                                       sent_boundaries=sents)

        ## mark all ref tokens if they are numeric or temporal
        chroList = utils.markTemporal(my_refToks)

        if (debug):
            print("REFERENCE TOKENS:\n")
            for tok in chroList:
                print(tok)

        tempPhrases = utils.getTemporalPhrases(chroList, doctime)
def createMLTrainingMatrix(infiles,
                           gold_folder,
                           ext="",
                           save=False,
                           output="aquaint_train",
                           window=3):
    """Build a gold-standard feature matrix for training ML classifiers.

    Algorithm, per input file:
      1) parse text to a reference-token list
      2) mark which tokens are numeric or temporal
      3) import the gold standard "period-interval.gold.csv" file
      4) for each reference token that overlaps a gold period/interval,
         build a feature vector and record its binary category
      5) write all features to <output>_data.csv and the categories to
         <output>_class.csv for import by other training scripts

    :param infiles: list of input file path stems (extensions are appended).
    :param gold_folder: folder containing one sub-folder per input file,
        each holding a "period-interval.gold.csv" file.
    :param ext: filename extension appended to each input stem.
    :param save: if True, also write a debug trace of the gold parsing to
        ./gold-standard-parsing.txt.
    :param output: stem used for the two output CSV files.
    :param window: +/- token window for the bag-of-words features.
    :return: tuple (full_obs_list, category) — a list of complete feature
        dicts and the parallel list of labels (1 = Period, 0 = otherwise).
    """
    ## define list of dictionary feature vectors
    obs_list = []  # one feature dict per observation
    ## 1 for period, 0 otherwise.  Note that the unknowns are grouped in with
    ## the calendar-interval category, making this a binary label; that may
    ## need to be parsed out later or the algorithm changed to be non-binary.
    category = []

    ## Full feature template.  The key set grows as the extract_* helpers add
    ## features; each observation is later merged over a copy of this dict so
    ## every row ends up with the same columns.
    features = {
        'feat_numeric': 0,
        'feat_temp_context': 0,
        'feat_temp_self': 0
    }

    ## Open the debug trace file only when requested; closed before returning
    ## (the original leaked this handle).
    outfile = open("./gold-standard-parsing.txt", 'w') if save else None

    ## Loop through each file and parse
    for f in range(0, len(infiles)):
        print("ML Parsing " + infiles[f] + " ...")

        ## parse out the doctime
        doctime = utils.getDocTime(infiles[f] + ".dct")
        if (debug): print(doctime)

        ## parse out reference tokens
        text, tokens, spans, tags = utils.getWhitespaceTokens(infiles[f] + ext)
        my_refToks = referenceToken.convertToRefTokens(tok_list=tokens,
                                                       span=spans,
                                                       pos=tags)

        ## mark all ref tokens if they are numeric or temporal
        chroList = utils.markTemporal(my_refToks)

        ## import gold standard data
        gold_file = os.path.join(gold_folder,
                                 os.path.split(infiles[f])[1],
                                 "period-interval.gold.csv")
        gold_list = []

        if not os.path.exists(gold_file):
            # NOTE(review): this aborts ALL remaining files on the first
            # missing gold file; `continue` may be the intent — confirm.
            print(gold_file + " DOES NOT EXIST")
            break

        if (save):
            outfile.write("\n$$$$$$$$$$$\nProcessing: " + gold_file)
        with open(gold_file) as file:
            reader = csv.DictReader(file)
            for row in reader:
                gold_list.append({
                    'type': row['type'],
                    'start': row['start'],
                    'end': row['end'],
                    'value': row['value']
                })
                if (save):
                    outfile.write("\n" + str(row))

        ## loop through each reftoken term and see if it overlaps with a gold token
        for r in range(0, len(chroList)):
            reftok = chroList[r]
            ref_s, ref_e = reftok.getSpan()
            # loop through each gold instance and find the one that overlaps
            # with the current reftok.
            for g in gold_list:
                if utils.overlap(
                    [ref_s, ref_e],
                    [int(g['start']), int(g['end'])]):
                    # the gold token overlaps the current reftok: extract the
                    # features from the reftok and add them to the list
                    this_obs = {}

                    if (save):
                        outfile.write("\nPrevious Token: " +
                                      str(chroList[max(r - 1, 0)]))
                        outfile.write("\nTarget Token: " + str(reftok))
                        outfile.write(
                            "\nNext Token: " +
                            str(chroList[min(r + 1,
                                             len(my_refToks) - 1)]) + "\n")

                    ### Identify Temporal features
                    this_obs = extract_temp_features(
                        chroList, r, 3, this_obs)

                    ### Extract all words within a N-word window
                    this_obs, observations = extract_bow_features(
                        chroList, r, window, features, this_obs)

                    ### Determine if there is a numeric before or after the target word.
                    this_obs = extract_numeric_feature(
                        chroList, r, this_obs)

                    ### Stem and extract the actual word
                    this_obs, observations = extract_stem_feature(
                        chroList[r], features, this_obs)

                    ### Get the correct type
                    category.append(1 if g['type'] == 'Period' else 0)

                    obs_list.append(this_obs)

    ## All per-file writing is done; release the trace file (was leaked).
    if outfile is not None:
        outfile.close()

    ## Ok, I have all the features.  Now I just need to put them all together in a matrix.
    print("features length: " + str(len(features.keys())))
    print("obs_list length: " + str(len(obs_list)))
    print("category length: " + str(len(category)))

    ## Pad each observation with the full feature template so every row
    ## contains every feature column.
    full_obs_list = []  # a list of feature dicts, parallel to `category`
    for obs in obs_list:
        feats = deepcopy(features)
        feats.update(obs)
        full_obs_list.append(feats)

    ## Guard the empty case (e.g. a missing gold file aborted the loop) —
    ## the original crashed with IndexError here.
    keys = full_obs_list[0].keys() if full_obs_list else list(features.keys())

    ## Now print the feature rows to a file, then return the list.
    with open(output + '_data.csv', 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(full_obs_list)

    with open(output + '_class.csv', 'w') as output_file:
        for c in category:
            output_file.write("%s\n" % c)

    ### Now return the feature list and the categories
    return (full_obs_list, category)