def convert_to_trainable_format(title, title_transform_func, feature_extractor, **kwargs):
    """Turn a raw (pre-capitalization) title into CRF-suite training rows.

    The title is tokenized (unless already given as a word list), passed
    through `title_transform_func`, featurized by `feature_extractor`, and
    each word's feature dict gets its capitalization label under key "y".

    >>> from cap_transform import make_capitalized_title
    >>> from capitalization_restoration.feature_extractor import FeatureExtractor
    >>> extractor = FeatureExtractor()
    >>> sent = convert_to_trainable_format(u"Why oil prices will be 'robust' long-term: Shell CEO", make_capitalized_title, extractor, docpath="test_data/oil-price")
    >>> sent[2]["word"]
    u'Prices'
    >>> sent[5]["lower-in-dict"]
    False
    >>> sent[1]["y"]
    'AL'
    """
    words = title if isinstance(title, list) else nltk.word_tokenize(title)

    transformed = title_transform_func(title_words=words)
    featured = feature_extractor.extract(transformed, **kwargs)

    # attach the gold label derived from each word's original casing
    for raw_word, features in zip(words, featured):
        features["y"] = get_label(raw_word)

    return featured
def convert_to_trainable_format(title, title_transform_func, feature_extractor,
                                **kwargs):
    """Build CRF-suite training rows from an uncapitalized title.

    Tokenizes `title` when it is a string (a list is taken as pre-tokenized
    words), applies `title_transform_func`, extracts per-word features with
    `feature_extractor`, and stores each word's capitalization label in the
    feature dict under "y".

    >>> from cap_transform import make_capitalized_title
    >>> from capitalization_restoration.feature_extractor import FeatureExtractor
    >>> extractor = FeatureExtractor()
    >>> sent = convert_to_trainable_format(u"Why oil prices will be 'robust' long-term: Shell CEO", make_capitalized_title, extractor, docpath="test_data/oil-price")
    >>> sent[2]["word"]
    u'Prices'
    >>> sent[5]["lower-in-dict"]
    False
    >>> sent[1]["y"]
    'AL'
    """
    if isinstance(title, list):
        tokens = title
    else:
        tokens = nltk.word_tokenize(title)

    rows = feature_extractor.extract(title_transform_func(title_words=tokens),
                                     **kwargs)

    # label every row from the original (unmodified) token's casing
    for token, row in zip(tokens, rows):
        row["y"] = get_label(token)

    return rows
Example #3
0
def get_lebel():
    """Return the labels of a task, read from the JSON request body.

    Expects a JSON payload carrying the user id, task id, task type and an
    optional "labelCount".  The labels are wrapped per task type
    (CLASSIFICATION -> "labelList", NER -> "label"); a missing taskId or any
    exception yields a failure response.

    NOTE(review): the name "get_lebel" looks like a typo for "get_label",
    but it is kept — renaming would break any route/caller bound to it.
    """
    try:
        data = request.json
        user_id = data.get(USERID)
        task_id = data.get(TASKID)
        task_type = data.get(TASKTYPE)
        label_count = data.get("labelCount")
        # fix: compare against None with "is", not "=="
        if task_id is None:
            return return_failed_with_msg("No taskId")
        result_label = Label.get_label(user_id, task_id, task_type, label_count)
        if task_type == CLASSIFICATION:
            result = {
                TASKID: task_id,
                TASKTYPE: task_type,
                "labelList": result_label
            }
            return Respond.return_success_with_data(result)
        if task_type == NER:
            result = {
                TASKID: task_id,
                TASKTYPE: task_type,
                "label": result_label
            }
            return Respond.return_success_with_data(result)
        # NOTE(review): an unrecognized task_type falls through and returns
        # None, matching the original behavior — confirm this is intended.
    except Exception:  # fix: bare except also caught SystemExit/KeyboardInterrupt
        return Respond.return_failed()
def make_capitalized_title(title=None, title_words=None):
    """Return the headline-cased word list for a title.

    Accepts either a raw title string (tokenized with NLTK) or an already
    tokenized word list.  Words labeled 'MX' are kept verbatim; the first
    word always gets a leading capital; articles, prepositions and
    conjunctions are lowercased; any other word keeps an existing leading
    capital or is capitalized.

    Raises ValueError when neither argument is supplied.
    """
    if title_words:
        tokens = title_words
    elif title:
        tokens = nltk.word_tokenize(title)
    else:
        raise ValueError("Receive nothing..")

    capitalized = []
    for position, token in enumerate(tokens):
        lowered = token.lower()
        starts_upper = token[0] == token[0].upper()
        if get_label(token) == 'MX':
            # mixed-case tokens are left exactly as-is
            capitalized.append(token)
        elif position == 0:
            capitalized.append(token if starts_upper else token.capitalize())
        elif (lowered in ARTICLES or
              lowered in PREPOSITIONS or
              lowered in CONJUNCTIONS):
            capitalized.append(lowered)
        elif starts_upper:  # already capitalized
            capitalized.append(token)
        else:
            capitalized.append(token.capitalize())
    return capitalized
def load_labeled_data(path):
    """Yield, for each JSON line in *path*, the title's (word, label) pairs.

    Each line is expected to be a JSON array whose second element is the
    title; the title is tokenized with NLTK and every token is paired with
    its capitalization label from ``get_label``.

    >>> d = load_labeled_data(path = "fnames_and_titles.txt")
    >>> d.next()[:8]
    [(u'The', 'IC'), (u'Sun', 'IC'), (u'Life', 'IC'), (u'Building', 'IC'), (u'receives', 'AL'), (u'LEED', 'AU'), (u'Silver', 'IC'), (u'Certification', 'IC')]
    """
    import io  # local import: keeps the fix self-contained
    # BUG FIX: open(path, "r", "utf8") passed "utf8" as the *buffering*
    # argument (a TypeError); io.open takes an explicit encoding keyword.
    with io.open(path, "r", encoding="utf8") as f:
        for line in f:
            _, title = json.loads(line)
            words = nltk.word_tokenize(title)
            yield [(w, get_label(w)) for w in words]
def load_labeled_data(path):
    """Generate (word, capitalization-label) lists, one per JSON line of *path*.

    Every line holds a JSON array whose second element is a title; the title
    is NLTK-tokenized and each token labeled via ``get_label``.

    >>> d = load_labeled_data(path = "fnames_and_titles.txt")
    >>> d.next()[:8]
    [(u'The', 'IC'), (u'Sun', 'IC'), (u'Life', 'IC'), (u'Building', 'IC'), (u'receives', 'AL'), (u'LEED', 'AU'), (u'Silver', 'IC'), (u'Certification', 'IC')]
    """
    import io  # local import: keeps the fix self-contained
    # BUG FIX: the original open(path, "r", "utf8") handed "utf8" to the
    # buffering parameter (a TypeError); io.open accepts encoding= directly.
    with io.open(path, "r", encoding="utf8") as f:
        for line in f:
            _, title = json.loads(line)
            words = nltk.word_tokenize(title)
            yield [(w, get_label(w)) for w in words]
Example #7
0
# NOTE(review): the matching `if` branch of this `else:` is outside the
# visible excerpt — presumably it derives the date windows differently
# (e.g. from configuration); confirm against the full script.
else:

	# fixed evaluation window: labels computed from May 2017 onward
	test_label_begin_date = datetime(2017, 5, 1)
	test_label_end_date, test_feature_begin_dates, test_feature_end_date = get_dates(test_label_begin_date)

	# training window: labels from April 2017 onward
	train_label_begin_date = datetime(2017, 4, 1)
	train_label_end_date, train_feature_begin_dates, train_feature_end_date = get_dates(train_label_begin_date)


########## FEATURE EXTRACTION ##########


# get training feature and label
train_feature = get_feature(data, train_feature_begin_dates, train_feature_end_date, featured_month_periods)
train_label = get_label(data, train_label_begin_date, train_label_end_date)

# get test feature
test_feature = get_feature(data, test_feature_begin_dates, test_feature_end_date, featured_month_periods)


########## MODEL TRAINING ##########

# drop the id column so only feature/target values feed the model;
# assumes train_feature / train_label / test_feature are DataFrames — TODO confirm
x_train = train_feature.drop('user_id', axis=1)
y_train = train_label.drop('user_id', axis=1)
x_test = test_feature.drop('user_id', axis=1)

# LightGBM-style parameter dict (truncated in this excerpt)
model_params = {
  'task': 'train',
  'boosting_type': 'gbdt',
  'objective': 'regression',
Example #8
0
        # Compute accuracy and the positive counts over all accumulated
        # labels/predictions at a 0.5 threshold.
        acc, true_positives, real_positives, predicted_positives = utils.calc_acc_f1(
            label_all, pred_all, 0.5)

        # Append the run's metrics to a persistent log file.
        fout = open('log.txt', 'a+', encoding='utf-8')
        fout.write('\n' + '*' * 20 + '\n')
        fout.write('acc:' + str(acc) + '\n')
        fout.write('true_positives:' + str(true_positives) + '\n')
        fout.write('real_positives:' + str(real_positives) + '\n')
        fout.write('predicted_positives:' + str(predicted_positives) + '\n')
        fout.close()
        #        acc, true_positives, real_positives, predicted_positives = utils.calc_acc_f1(target, output, 0.5)
        print('f1:%.4f' % (f1))

    if (args.command == "check"):

        # Gather filenames and their gold labels; pred accumulates rounded
        # per-fold outputs indexed by filename position.
        filenameslist, filelabelslist = get_label()
        pred = np.zeros(filelabelslist.shape)
        if torch.cuda.is_available():
            label_all = torch.Tensor().cuda()
            pred_all = torch.Tensor().cuda()
        else:
            label_all = torch.Tensor()
            pred_all = torch.Tensor()
        # Run the 5 cross-validation folds; presumably check() returns
        # (targets, sigmoid outputs, filenames) for one fold — TODO confirm.
        for i in range(5):
            config.train_data = 'path/train'
            args.fold = i
            target, output, filename = check(args)
            idx = []
            for tmp_name in filename:
                idx.append(filenameslist.index(tmp_name))
            pred[idx] += np.round(output.cpu().detach().numpy())