Example #1

import os
import json

from tc_main import TopCoder


def clean_tech_lst(top_n=5):
    """ Clean up technology list of challenges."""
    tc = TopCoder()  # trying to reduce the length of variable name here
    filt_cha_info = tc.get_filtered_challenge_basic_info(
    )  # it's readable for me anyway ;-)
    most_popular_tech = tc.get_tech_popularity().head(
        top_n).tech_name.to_list()

    with open(os.path.join(os.curdir, 'data', 'tech_by_challenge.json')) as f:
        tech_by_cha_rough = {
            cha['challenge_id']: cha['tech_lst']
            for cha in json.load(f)
            if cha['challenge_id'] in filt_cha_info.index
        }

    print(f'Top {top_n} most popular technologies', most_popular_tech)

    tech_by_cha = []
    for cha_id, tech_lst in tech_by_cha_rough.items():
        cleaned_tech_lst = [
            'angularjs' if 'angular' in tech.lower() else tech.lower()
            for tech in tech_lst
        ]
        filtered_tech_lst = [
            tech for tech in cleaned_tech_lst if tech in most_popular_tech
        ]
        if filtered_tech_lst:
            tech_by_cha.append({
                'challenge_id': cha_id,
                'tech_lst': filtered_tech_lst
            })

    print(f'Challenges with tech after filtering: {len(tech_by_cha)}')

    with open(os.path.join(os.curdir, 'data', 'tech_by_challenge_clean.json'),
              'w') as f:
        json.dump(tech_by_cha, f, indent=4)
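
A minimal usage sketch for the function above, assuming data/tech_by_challenge.json and the TopCoder data used by tc_main are available in the working directory; the call below is illustrative and not part of the original example.

if __name__ == '__main__':
    # Keep only technologies among the 5 most popular ones per challenge and
    # write the cleaned list to data/tech_by_challenge_clean.json.
    clean_tech_lst(top_n=5)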

Example #2

""" Train KNN model with hand-picked challenges by price range."""
import os
import json
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tc_main import TopCoder

TOPCODER = TopCoder()
DOC_VEC_SIZE = 100  # dimensionality chosen empirically
DOC_VEC_PATH = os.path.join(os.curdir, 'pricing_model_0', 'develop_track', 'document_vec', f'document_vec_{DOC_VEC_SIZE}D.json')

HANDPICKED_CHALLENGES = TOPCODER.get_handpick_dev_cha_id()

def get_challenge_meta_data():
    """ Return challenge meta data of handpicked challenges."""
    cbi_df = TOPCODER.challenge_basic_info.loc[TOPCODER.challenge_basic_info.index.isin(HANDPICKED_CHALLENGES)]

    challenge_duration = (cbi_df.submission_end_date - cbi_df.registration_start_date).apply(lambda td: td.days)

    meta_data = pd.concat(
        [
            cbi_df.reindex(['subtrack'], axis=1).astype('category').apply(lambda c: c.cat.codes),
            cbi_df.reindex(['number_of_platforms'], axis=1),
"""

import os
import json
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tc_main import TopCoder

TOPCODER = TopCoder()
ACTUAL_PRIZE = TOPCODER.challenge_basic_info.total_prize[TOPCODER.challenge_basic_info.total_prize != 0]

def get_path_by_track_and_dimension(track, dimension):
    """ Get document vector path by track and doc vec dimension."""
    return os.path.join(os.curdir, 'pricing_model_0', f'{track}_track', 'document_vec', f'document_vec_{dimension}D.json')

def get_path_handpick_challenge(no_overlap, with_phrase):
    """ Get document vector path from pricing model 4"""
    return os.path.join(os.curdir, 'pricing_model_4', 'document_vec', f'document_vec_{str(no_overlap)[0]}{str(with_phrase)[0]}_600D.json')
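
As a worked illustration (not in the original source), the two boolean flags are shortened to their first letters in the file name, since str(True)[0] is 'T' and str(False)[0] is 'F':

# Illustrative call: resolves to ./pricing_model_4/document_vec/document_vec_TF_600D.json
# with a POSIX path separator.
print(get_path_handpick_challenge(no_overlap=True, with_phrase=False))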

def get_challenge_meta_data():
    """ Return challenge meta data in pandas DataFrame."""
    cha_basic_info = TOPCODER.challenge_basic_info
    challenge_duration = (cha_basic_info.submission_end_date - cha_basic_info.registration_start_date).apply(lambda td: td.days)

Example #4

import os
import json

import numpy as np
import pandas as pd

from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

from tc_main import TopCoder

PP_PATH = os.path.join(os.curdir, 'pricing_model_6', 'preprocess_data')
PP_DATA = {
    'splt_cha': os.path.join(PP_PATH, 'split_challenges.json'),
}

XY_PATH = {
    'X': os.path.join(os.curdir, 'pricing_model_6', 'round1', 'X_{}.json'),
    'y': os.path.join(os.curdir, 'pricing_model_6', 'round1', 'y_{}.json')
}
RESULT_PATH = os.path.join(os.curdir, 'pricing_model_6', 'round1_res')
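
The {} placeholder in the XY_PATH templates is meant to be filled via str.format; the 'train' tag below is a hypothetical placeholder, purely to show the mechanism.

# Hypothetical tag for illustration only.
x_path = XY_PATH['X'].format('train')  # ./pricing_model_6/round1/X_train.json
y_path = XY_PATH['y'].format('train')  # ./pricing_model_6/round1/y_train.json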

TC = TopCoder()
FILT_CHA_INFO = TC.get_filtered_challenge_basic_info()

def split_challenges():
    """ Split challenges into 10 equal part randomly, with proportionally divided challenges by subtrack.
        It's randomly splited but consistant with a fixed random_state param.
    """
    cha_id_sr = pd.Series(FILT_CHA_INFO.index)
    split_cha_id = [splt_ids.to_list() for splt_ids in np.array_split(cha_id_sr.sample(frac=1, random_state=0), 10)]
    with open(PP_DATA['splt_cha'], 'w') as fwrite:
        json.dump(split_cha_id, fwrite, indent=4)
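
Because sample(frac=1, random_state=0) shuffles the challenge ids deterministically, rerunning split_challenges always yields the same 10 chunks. The check below is an illustrative addition (not part of the original module) that reloads the chunks and verifies they cover every filtered challenge exactly once.

def check_split_coverage():
    """ Illustrative sanity check: the 10 chunks partition the filtered challenge ids."""
    with open(PP_DATA['splt_cha']) as fread:
        split_cha_id = json.load(fread)
    all_ids = [cha_id for chunk in split_cha_id for cha_id in chunk]
    assert len(all_ids) == len(FILT_CHA_INFO.index)
    assert set(all_ids) == set(FILT_CHA_INFO.index)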

def get_train_test_Xy(X: pd.DataFrame, y: pd.DataFrame, chunk_idx: int):
    """ Get train X, test X, train y, test y for given chunk of challenge ids."""
    with open(PP_DATA['splt_cha']) as fread:
        split_cha_id = json.load(fread)

Example #5

import os
import json
import itertools
from datetime import datetime

import numpy as np
import pandas as pd

from tc_main import TopCoder
from tc_pricing_models import cosine_similarity

DATA_PATH = os.path.join(os.curdir, 'pricing_model_6',
                         'training_data_segments')
TRAINING_DATA_PATH = os.path.join(os.curdir, 'pricing_model_6', 'round1')

TOPCODER = TopCoder()
FILTERED_CHALLENGE_INFO = TOPCODER.get_filtered_challenge_basic_info()
CHALLENGE_ID_COMBINATION = lambda: itertools.combinations(
    FILTERED_CHALLENGE_INFO.index, 2)

SUBTRACK_COMB = [
    sorted(subtrack_comb)
    for subtrack_comb in itertools.combinations_with_replacement(
        FILTERED_CHALLENGE_INFO.subtrack.unique(), 2)
]
TECH_COMB = \
    [sorted(tech_comb) for tech_comb in itertools.combinations_with_replacement(TOPCODER.get_tech_popularity().head(5).tech_name, 2)] +\
    TOPCODER.get_tech_popularity().head(5).tech_name.to_list()
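
For intuition, combinations_with_replacement enumerates every unordered pair including self-pairs, which is what SUBTRACK_COMB and TECH_COMB rely on above. A tiny self-contained sketch with made-up labels:

import itertools

labels = ['assembly', 'first2finish', 'ui_prototype']  # made-up labels for illustration
pairs = [sorted(comb) for comb in itertools.combinations_with_replacement(labels, 2)]
# pairs == [['assembly', 'assembly'], ['assembly', 'first2finish'],
#           ['assembly', 'ui_prototype'], ['first2finish', 'first2finish'],
#           ['first2finish', 'ui_prototype'], ['ui_prototype', 'ui_prototype']]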

TOP5_SUBTRACK = list(
    FILTERED_CHALLENGE_INFO.subtrack.value_counts().sort_values(

Example #6
""" Build pricing model with random forest."""
import os
import json
import pickle
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from tc_main import TopCoder
from pair_cha_imbl_learning import PP_DATA

TOPCODER = TopCoder()
FILT_CHA_INFO = TOPCODER.get_filtered_challenge_basic_info()

# Round 1


def prz_estimation_from_prob(y_prob_path, target_ids: list):
    """ Estimate challenge prize from top most confident predictions."""
    prob_df = pd.read_json(y_prob_path,
                           orient='records').set_index(['l0', 'l1'])

    prz_estimation = []
    for cha_id in target_ids:
        cha_pair = prob_df.loc[
            (prob_df.index.get_level_values(0) == cha_id) |
            (prob_df.index.get_level_values(1) == cha_id)].copy()
        cha_pair.index = cha_pair.index.map(
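
The visible selection step in prz_estimation_from_prob pulls out, for a target challenge, every (l0, l1) pair that contains it, no matter which side of the pair the id sits on. A toy sketch of that selection with made-up ids and probabilities (not part of the original code):

import pandas as pd

toy_prob_df = pd.DataFrame(
    {'l0': [1, 1, 2], 'l1': [2, 3, 3], 'prob': [0.9, 0.4, 0.7]}
).set_index(['l0', 'l1'])

cha_id = 3
cha_pair = toy_prob_df.loc[
    (toy_prob_df.index.get_level_values(0) == cha_id) |
    (toy_prob_df.index.get_level_values(1) == cha_id)]
# cha_pair keeps the (1, 3) and (2, 3) rows, i.e. every pair involving challenge 3.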