Example #1
def main():
    print("Reading csv files")
    dataset = read_all_DS()

    trainset = pd.read_csv('dataRev2/Train.csv')
    train_confirmed = trainset[['AuthorId', 'ConfirmedPaperIds']].rename(columns = {'ConfirmedPaperIds':'PaperIds'})
    train_deleted = trainset[['AuthorId', 'DeletedPaperIds']].rename(columns = {'DeletedPaperIds':'PaperIds'})
    validset = pd.read_csv('dataRev2/Valid.csv')
    testset = pd.read_csv('dataRev2/Test.csv')

    allsets = pd.concat([train_confirmed, validset, testset])
    all_dups = make_duplicates_from_targets(allsets)
    dataset['all_duplicates'] = all_dups


    print("Getting features for confirmed papers")
    features_conf = get_features(dataset, train_confirmed)

    print("Getting features for deleted papers")
    features_deleted = get_features(dataset, train_deleted)

    print("Getting features for valid papers")
    features_valid = get_features(dataset, validset)

    pickle.dump(features_deleted, open(data_io.get_paths()["deleted_features"], 'wb'))
    pickle.dump(features_conf, open(data_io.get_paths()["confirmed_features"], 'wb'))
    pickle.dump(features_valid, open(data_io.get_paths()["valid_features"], 'wb'))

    print("Getting features for test papers")
    features_test = get_features(dataset, testset)
    pickle.dump(features_test, open(data_io.get_paths()["test_features"], 'wb'))
Example #2
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    sorted_samples = samples.sort_values(by=["srch_id"])
    sorted_samples = sorted_samples.reset_index(drop=True)
    processed_samples = pd.DataFrame()
    samples_in_one_srch = pd.DataFrame()
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "Processed %i sample of %i" % (r_idx + 1,
                                                 sorted_samples.shape[0])
        is_next_in_same_search = True
        samples_in_one_srch = pd.concat(
            (sample.to_frame().transpose(), samples_in_one_srch), axis=0)
        current_srch_id = sample["srch_id"]
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples["srch_id"][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False
        if not is_next_in_same_search:
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)
            n_samples = ext_samples_in_one_srch.shape[0]
            if n_samples > max_srch_size:
                if np.any(ext_samples_in_one_srch["booking_bool"]):
                    pos_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch["booking_bool"] == 1]
                    neg_samples = ext_samples_in_one_srch[
                        ext_samples_in_one_srch["booking_bool"] == 0]
                    selected_neg_samples = neg_samples.sample(
                        n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat(
                        (pos_samples, selected_neg_samples), axis=0)
                else:
                    selected_samples = ext_samples_in_one_srch.sample(
                        n=max_srch_size)
            else:
                selected_samples = ext_samples_in_one_srch.copy()
            processed_samples = pd.concat(
                (processed_samples, selected_samples), axis=0)
            samples_in_one_srch = pd.DataFrame()
        if (r_idx + 1) % each_saved_size == 0:
            save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
            save_path = get_paths()["proc_train_path"]
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            if np.any(np.isnan(processed_samples.values)):
                processed_samples = processed_samples.fillna(value=0)
            processed_samples.to_csv(os.path.join(save_path, save_file_name),
                                     index=None)

    # Save whatever remains after the loop finishes.
    save_file_name = "proc_train_samples_%i.csv" % (r_idx + 1)
    save_path = get_paths()["proc_train_path"]
    if np.any(np.isnan(processed_samples.values)):
        processed_samples = processed_samples.fillna(value=0)
    processed_samples.to_csv(os.path.join(save_path, save_file_name),
                             index=None)
Example #3
    def parse_papers(self):
        # Create Papers
        print "Parsing Papers..."
        f = open(data_io.get_paths()["paper_processed_path"], "r")
        titles = f.readline()
        for l in f.readlines():
            res = l.strip().split(",")
            paper_title = unidecode.unidecode(unicode(res[1],
                                                      encoding="utf-8"))
            title_words = nlp.filter_paper_title(paper_title)
            paper_keyword = unidecode.unidecode(
                unicode(res[5], encoding="utf-8"))
            filtered_keyword = nlp.filter_paper_keyword(paper_keyword)
            self.papers[int(res[0])] = paper.Paper(int(res[0]), title_words,
                                                   int(res[2]), int(res[3]),
                                                   int(res[4]),
                                                   filtered_keyword)

            for tt in title_words.split():
                try:
                    self.paper_titles[tt] = self.paper_titles[tt] + 1
                except:
                    self.paper_titles[tt] = 1

        print "Done"
        f.close()
def classify_catagory(train, test):
    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(train,
                                                    test,
                                                    random_state=1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    classifier = GradientBoostingClassifier(n_estimators=1024,
                                            random_state=1,
                                            subsample=.8,
                                            min_samples_split=10,
                                            max_depth=6,
                                            verbose=3)
    classifier.fit(trainX, trainY)
    print "Score = ", classifier.score(testX, testY)

    feature_importrance = classifier.feature_importances_
    logger = open(data_io.get_paths()["feature_importance_path"], "a")
    for fi in feature_importrance:
        logger.write(str(fi))
        logger.write("\n")
    logger.write("###########################################\n")
    logger.close()

    return classifier
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'add.noise.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Additive noise model AB', ['A','B'], f.add_noise_model_AB),
                ('Additive noise model BA', ['A','B'], f.add_noise_model_BA)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]

    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)

    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)

    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)

    print("Writing feature file")
    data_io.write_real_features('add_noise', all_features, feature_names)
    def parse_authors(self):
        # Create authors
        print "Parsing Authors..."
        f = open(data_io.get_paths()["author_processed_path"], "r")
        titles = f.readline()
        for l in f.readlines():
            res = l.strip().split(",")
            # Titles
            raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
            (name, surname) = nlp.filter_title(raw_title)
            try:
                self.surnames[surname] = self.surnames[surname] + 1
            except:
                self.surnames[surname] = 1

            #Affiliations
            raw_affiliation = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
            affiliation = nlp.filter_affiliation(raw_affiliation)
            try:
                self.affiliations[affiliation] = self.affiliations[affiliation] + 1
            except:
                self.affiliations[affiliation] = 1
            self.authors[int(res[0])] = author.Author(int(res[0]), name, surname, affiliation)

        print "Done"
        f.close()
Example #7
    def parse_authors(self):
        # Create authors
        print "Parsing Authors..."
        f = open(data_io.get_paths()["author_processed_path"], "r")
        titles = f.readline()
        for l in f.readlines():
            res = l.strip().split(",")
            # Titles
            raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
            (name, surname) = nlp.filter_title(raw_title)
            try:
                self.surnames[surname] = self.surnames[surname] + 1
            except:
                self.surnames[surname] = 1

            #Affiliations
            raw_affiliation = unidecode.unidecode(
                unicode(res[2], encoding="utf-8"))
            affiliation = nlp.filter_affiliation(raw_affiliation)
            try:
                self.affiliations[
                    affiliation] = self.affiliations[affiliation] + 1
            except:
                self.affiliations[affiliation] = 1
            self.authors[int(res[0])] = author.Author(int(res[0]), name,
                                                      surname, affiliation)

        print "Done"
        f.close()
Example #8
 def parse_paperauthors(self):
     # Update all journal/conference/coauthor information
     print "Parsing PaperAuthors..."
     f = open(data_io.get_paths()["paperauthor_processed_path"], "r")
     titles = f.readline()
     count = 0
     for l in f:
         count += 1
         if count % 100000 == 0:
             print count
         res = l.strip().split(",")
         if not res[0].isdigit():
             continue
         paper_id = int(res[0])
         author_id = int(res[1])
         raw_author_name = unidecode.unidecode(
             unicode(res[2], encoding="utf-8"))
         author_name = nlp.filter_title(raw_author_name)[0]
         raw_author_affiliation = unidecode.unidecode(
             unicode(res[3], encoding="utf-8"))
         author_affiliation = nlp.filter_affiliation(raw_author_affiliation)
         curr_paper = self.papers.get(paper_id)
         curr_author = self.authors.get(author_id)
         self.update_paperauthor(curr_paper, curr_author, author_id,
                                 author_name, author_affiliation)
     print "Done"
     f.close()
Example #9
def train(f, file_path):
    file_pt = open(file_path, "r")
    title = file_pt.readline()
    ret = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if ret == None:
            ret = fet
        elif fet != None:
            ret = numpy.vstack((ret, fet))
        print ret.shape


#  classifier = RandomForestClassifier(n_estimators=100,
#                                      verbose=2,
#                                      n_jobs=1,
#                                      min_samples_split=10,
#                                      random_state=1)

    classifier = GradientBoostingClassifier(n_estimators=512,
                                            verbose=3,
                                            max_depth=6,
                                            min_samples_split=10,
                                            subsample=0.8,
                                            random_state=1)

    valid_ret = validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    ret = numpy.vstack((ret, valid_ret))
    print "Final size: ", ret.shape

    trainX, testX, trainY, testY = train_test_split(ret[:, 3:],
                                                    ret[:, 0],
                                                    random_state=1)

    classifier.fit(trainX, trainY)

    numpy.savetxt(data_io.get_paths()["feature_path"],
                  ret.astype(float),
                  fmt='%f',
                  delimiter=",")

    print classifier.score(testX, testY)
    #validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])

    return classifier
def main():
    submission_path = data_io.get_paths()["submission_path"]
    reader = csv.reader(open(submission_path))
    reader.next() # skipping the header
    recommendations = [(int(row[0]), int(row[1]), -i)
        for i,row in enumerate(reader)]
    out_path = submission_path[:-4]+"Reversed.csv"
    data_io.write_submission(recommendations, submission_path=out_path)
def main():
    submission_path = data_io.get_paths()["submission_path"]
    reader = csv.reader(open(submission_path))
    reader.next()  # skipping the header
    recommendations = [(int(row[0]), int(row[1]), -i)
                       for i, row in enumerate(reader)]
    out_path = submission_path[:-4] + "Reversed.csv"
    data_io.write_submission(recommendations, submission_path=out_path)
Example #12
def paper_keywords(data):
    paper = data['paper']
    paperid = list(paper["Id"])
    paper_keyword = defaultdict(list)

    paper = paper.set_index("Id")

    paper['Keyword'] = paper['Keyword'].fillna("")
    paper['Title'] = paper['Title'].fillna("")

    title = list(paper["Title"])
    cnt = 0
    start_time = time.time()
    titleTokens = []

    print("Start title!!!")
    for t in title:
        cnt += 1
        if (cnt % 100000 == 0):
            print("Count: ", cnt)
            print("Time: ", time.time() - start_time)
        titleTokens.append(tokenize(t))
    paper['Token'] = titleTokens

    #paper['Token'] = paper.Title.map(tokenize)
    print("Start keyword!!!")

    keywords = list(paper['Keyword'])
    cnt2 = 0
    keywordTokens = []
    for k in keywords:
        cnt2 += 1
        if (cnt2 % 100000 == 0):
            print("Count: ", cnt2)
            print("Time: ", time.time() - start_time)
        keywordTokens.append(filter_keyword(k))

    paper['Keyword_pro'] = keywordTokens

    print("Start concatenation!!!")

    #TODO: change all "apply", "map" functions to explicit for loop. Don't use "for loop" in the list because it causes memory limit error.
    #concatenate keyword and token
    keyToken = []
    for i in paperid:
        keyToken.append(
            list(set(paper.loc[i, 'Keyword_pro'] + paper.loc[i, 'Token'])))
    paper['Key_token'] = keyToken
    for i in paperid:
        paper_keyword[i] = paper.loc[i, 'Key_token']

    pickle.dump(paper_keyword,
                open(data_io.get_paths()["paper_title_tokens"], 'wb'))

    print("Process done, time: ", time.time() - start_time)

    return paper_keyword
def predict(f, classifier, file_path):
  file_pt = open(file_path, "r")
  title = file_pt.readline()
  output = open(data_io.get_paths()["submission_path"], "a")
  tot_fet = None
  for l in file_pt.readlines():
    res = l.split(",")
    fet = f.create_features_from_res(res)
    if tot_fet == None:
      tot_fet = fet
    else:
      tot_fet = numpy.vstack((tot_fet, fet))
    pred = classifier.predict_proba(fet[:, 3:])
    sorted_pred = sorted(zip(res[1].split(), pred[:, 1]), key=lambda a:a[1], reverse=True)
    #print sorted_pred
    output.write(res[0] + "," + " ".join(map(lambda a: a[0], sorted_pred)) + "\n")
  output.close()
  numpy.savetxt(data_io.get_paths()["test_feature_path"], tot_fet.astype(float), fmt='%f', delimiter=",")
def train(f, file_path):
  file_pt = open(file_path, "r")
  title = file_pt.readline()
  ret = None
  for l in file_pt.readlines():
    res = l.split(",")
    fet = f.create_features_from_res(res)
    if ret == None:
      ret = fet
    elif fet != None:
      ret = numpy.vstack((ret, fet))
    print ret.shape

#  classifier = RandomForestClassifier(n_estimators=100, 
#                                      verbose=2,
#                                      n_jobs=1,
#                                      min_samples_split=10,
#                                      random_state=1)

  classifier = GradientBoostingClassifier(n_estimators=512, 
                                          verbose=3,
                                          max_depth=6,
                                          min_samples_split=10,
                                          subsample=0.8,
                                          random_state=1)

  valid_ret = validate(f,  data_io.get_paths()["valid_sol_path"], classifier)
  ret = numpy.vstack( (ret, valid_ret) )
  print "Final size: ", ret.shape

  trainX, testX, trainY, testY = train_test_split(ret[:, 3:], ret[:, 0], random_state=1)

  classifier.fit(trainX, trainY)

  numpy.savetxt(data_io.get_paths()["feature_path"], ret.astype(float), fmt='%f', delimiter=",")

  print classifier.score(testX, testY)
  #validate(f, data_io.get_paths()["valid_sol_path"], classifier)
  print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])

  return classifier
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'reasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('A: Normalized Entropy', 'A', f.normalized_entropy),
                ('B: Normalized Entropy', 'B', f.normalized_entropy),
                ('Pearson R', ['A','B'], f.correlation),
                ('Pearson R Magnitude', 'derived', 'abs(output[key][2])'),# Apologies for this weird feature definition mechanism - it is a quick hack to prevent duplicated computation
                ('Entropy Difference', 'derived', 'output[key][0] - output[key][1]'),
                ('Entropy Ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Spearman rank correlation', ['A','B'], f.rcorrelation),
                ('Spearman rank magnitude', 'derived', 'abs(output[key][6])'),
                ('Kurtosis A', 'A', f.fkurtosis),
                ('Kurtosis B', 'B', f.fkurtosis),
                ('Kurtosis difference', 'derived', 'output[key][8] - output[key][9]'),
                ('Kurtosis ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Unique ratio A', 'A', f.unique_ratio),
                ('Unique ratio B', 'B', f.unique_ratio),
                ('Skew A', 'A', f.fskew),
                ('Skew B', 'B', f.fskew),
                ('Skew difference', 'derived', 'output[key][14] - output[key][15]'),
                ('Skew ratio', 'derived', 'output[key][14] / output[key][15] if not output[key][15] == 0 else output[key][14] / 0.000001'),
                ('Pearson - Spearman', 'derived', 'output[key][2] - output[key][6]'),
                ('Abs Pearson - Spearman', 'derived', 'output[key][3] - output[key][7]'),
                ('Pearson / Spearman', 'derived', 'output[key][2] / output[key][6] if not output[key][6] == 0 else output[key][2] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('reasonable_features', all_features, feature_names)
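
The 'derived' entries above (see the apology comment next to 'Pearson R Magnitude') are plain expression strings that refer back to values already computed for the same pair, e.g. output[key][2]. The real f.apply_features is not shown on this page, so the following is only a rough sketch of how such a mixed callable/string feature list could be evaluated; pair.A and pair.B are assumed to hold the two series of one cause-effect pair.

# Sketch only: assumes pair.A / pair.B are the two columns of one pair and
# that f.apply_features behaves roughly like this for a single pair.
def apply_features_sketch(pair, features):
    out = []
    for name, column, extractor in features:
        if column == 'derived':
            # Expression strings such as 'output[key][0] - output[key][1]'
            # index into the values collected so far for this pair.
            out.append(eval(extractor, {'abs': abs, 'output': {0: out}, 'key': 0}))
        elif column == 'A':
            out.append(extractor(pair.A))
        elif column == 'B':
            out.append(extractor(pair.B))
        else:  # ['A', 'B']: the extractor takes both columns
            out.append(extractor(pair.A, pair.B))
    return out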
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'high_order_moments.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Moment 5 A', 'A', f.standard_moment_5),
                ('Moment 5 B', 'B', f.standard_moment_5),
                ('Moment 5 diff', 'derived', 'output[key][0] - output[key][1]'),
                ('Moment 5 ratio', 'derived', 'output[key][0] / output[key][1] if not output[key][1] == 0 else output[key][0] / 0.000001'),
                ('Moment 6 A', 'A', f.standard_moment_6),
                ('Moment 6 B', 'B', f.standard_moment_6),
                ('Moment 6 diff', 'derived', 'output[key][4] - output[key][5]'),
                ('Moment 6 ratio', 'derived', 'output[key][4] / output[key][5] if not output[key][5] == 0 else output[key][4] / 0.000001'),
                ('Moment 7 A', 'A', f.standard_moment_7),
                ('Moment 7 B', 'B', f.standard_moment_7),
                ('Moment 7 diff', 'derived', 'output[key][8] - output[key][9]'),
                ('Moment 7 ratio', 'derived', 'output[key][8] / output[key][9] if not output[key][9] == 0 else output[key][8] / 0.000001'),
                ('Moment 8 A', 'A', f.standard_moment_8),
                ('Moment 8 B', 'B', f.standard_moment_8),
                ('Moment 8 diff', 'derived', 'output[key][12] - output[key][13]'),
                ('Moment 8 ratio', 'derived', 'output[key][12] / output[key][13] if not output[key][13] == 0 else output[key][12] / 0.000001'),
                ('Moment 9 A', 'A', f.standard_moment_9),
                ('Moment 9 B', 'B', f.standard_moment_9),
                ('Moment 9 diff', 'derived', 'output[key][16] - output[key][17]'),
                ('Moment 9 ratio', 'derived', 'output[key][16] / output[key][17] if not output[key][17] == 0 else output[key][16] / 0.000001')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('high_order_moments', all_features, feature_names)
 def parse_conferences(self):
     print "Parsing Conferences..."
     f = open(data_io.get_paths()["conference_processed_path"], "r")
     titles = f.readline()
     for l in f.readlines():
         res = l.strip().split(",")
         conference_id = int(res[0])
         raw_conference_title = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
         conference_title = nlp.filter_paper_title(raw_conference_title)
         self.conferences[conference_id] = conference_title
         for c in conference_title.split():
             if c in self.conference_freq.keys():
                 self.conference_freq[c] = self.conference_freq[c] + 1
             else:
                 self.conference_freq[c] = 1
 def parse_journals(self):
     print "Parsing Journals..."
     f = open(data_io.get_paths()["journal_processed_path"], "r")
     titles = f.readline()
     for l in f.readlines():
         res = l.strip().split(",")
         journal_id = int(res[0])
         raw_journal_title = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
         journal_title = nlp.filter_paper_title(raw_journal_title)
         self.journals[journal_id] = journal_title
         for j in journal_title.split():
             if j in self.journal_freq.keys():
                 self.journal_freq[j] = self.journal_freq[j] + 1
             else:
                 self.journal_freq[j] = 1
Example #19
def predict(f, classifier, file_path):
    file_pt = open(file_path, "r")
    title = file_pt.readline()
    output = open(data_io.get_paths()["submission_path"], "a")
    tot_fet = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if tot_fet == None:
            tot_fet = fet
        else:
            tot_fet = numpy.vstack((tot_fet, fet))
        pred = classifier.predict_proba(fet[:, 3:])
        sorted_pred = sorted(zip(res[1].split(), pred[:, 1]),
                             key=lambda a: a[1],
                             reverse=True)
        #print sorted_pred
        output.write(res[0] + "," +
                     " ".join(map(lambda a: a[0], sorted_pred)) + "\n")
    output.close()
    numpy.savetxt(data_io.get_paths()["test_feature_path"],
                  tot_fet.astype(float),
                  fmt='%f',
                  delimiter=",")
Example #20
 def parse_conferences(self):
     print "Parsing Conferences..."
     f = open(data_io.get_paths()["conference_processed_path"], "r")
     titles = f.readline()
     for l in f.readlines():
         res = l.strip().split(",")
         conference_id = int(res[0])
         raw_conference_title = unidecode.unidecode(
             unicode(res[2], encoding="utf-8"))
         conference_title = nlp.filter_paper_title(raw_conference_title)
         self.conferences[conference_id] = conference_title
         for c in conference_title.split():
             if c in self.conference_freq.keys():
                 self.conference_freq[c] = self.conference_freq[c] + 1
             else:
                 self.conference_freq[c] = 1
Example #21
 def parse_journals(self):
     print "Parsing Journals..."
     f = open(data_io.get_paths()["journal_processed_path"], "r")
     titles = f.readline()
     for l in f.readlines():
         res = l.strip().split(",")
         journal_id = int(res[0])
         raw_journal_title = unidecode.unidecode(
             unicode(res[2], encoding="utf-8"))
         journal_title = nlp.filter_paper_title(raw_journal_title)
         self.journals[journal_id] = journal_title
         for j in journal_title.split():
             if j in self.journal_freq.keys():
                 self.journal_freq[j] = self.journal_freq[j] + 1
             else:
                 self.journal_freq[j] = 1
Example #22
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'icgi.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('ICGI entropy AB', ['A','B'], f.icgi_entropy_AB),
                ('ICGI entropy BA', ['A','B'], f.icgi_entropy_BA),
                ('ICGI entropy diff', 'derived', 'output[key][0] - output[key][1]'),
                ('ICGI slope AB', ['A','B'], f.icgi_slope_AB),
                ('ICGI slope BA', ['A','B'], f.icgi_slope_BA),
                ('ICGI slope diff', 'derived', 'output[key][3] - output[key][4]')]#,
                #('ICGI entropy AB PIT', ['A','B'], f.icgi_entropy_AB_PIT),
                #('ICGI entropy BA PIT', ['A','B'], f.icgi_entropy_BA_PIT),
                #('ICGI entropy diff PIT', 'derived', 'output[key][6] - output[key][7]'),
                #('ICGI slope AB PIT', ['A','B'], f.icgi_slope_AB_PIT),
                #('ICGI slope BA PIT', ['A','B'], f.icgi_slope_BA_PIT),
                #('ICGI slope diff PIT', 'derived', 'output[key][9] - output[key][10]')]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('icgi', all_features, feature_names)
 def parse_papers(self):
     # Create Papers
     print "Parsing Papers..."
     f = open(data_io.get_paths()["paper_processed_path"], "r")
     titles = f.readline()
     for l in f.readlines():
         res = l.strip().split(",")
         paper_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
         title_words = nlp.filter_paper_title(paper_title)
         paper_keyword = unidecode.unidecode(unicode(res[5], encoding="utf-8"))
         filtered_keyword = nlp.filter_paper_keyword(paper_keyword)
         self.papers[int(res[0])] = paper.Paper(int(res[0]), title_words, int(res[2]), int(res[3]), int(res[4]), filtered_keyword)
         
         for tt in title_words.split():
           try:
             self.paper_titles[tt] = self.paper_titles[tt] + 1
           except:
             self.paper_titles[tt] = 1
         
     print "Done"
     f.close()
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'unreasonable_features.csv')):
        print 'Feature file already exists - not overwriting'
        return
        
    features = [('Number of Samples', 'A', len),
                ('Max A', 'A', max),
                ('Max B', 'B', max),
                ('Min A', 'A', min),
                ('Min B', 'B', min),
                ('Mean A', 'A', f.mean),
                ('Mean B', 'B', f.mean),
                ('Median A', 'A', f.median),
                ('Median B', 'B', f.median),
                ('Sd A', 'A', f.sd),
                ('Sd B', 'B', f.sd)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('unreasonable_features', all_features, feature_names)
Example #25
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'corrs.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Kendall tau', ['A','B'], f.kendall),
                ('Kendall tau p', ['A','B'], f.kendall_p),
                ('Mann Whitney', ['A','B'], f.mannwhitney),
                ('Mann Whitney p', ['A','B'], f.mannwhitney_p),
                #('Wilcoxon', ['A','B'], f.wilcoxon),
                #('Wilcoxon p', ['A','B'], f.wilcoxon_p),
                ('Kruskal', ['A','B'], f.kruskal),
                ('Kruskal p', ['A','B'], f.kruskal_p),
                ]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('corrs', all_features, feature_names)
Example #26
def prediction(n_train_samples):
    proc_test_samples_file = get_paths()["proc_test_samples_path"]
    if os.path.exists(proc_test_samples_file):
        print "Loading processed test data..."
        new_test_samples = pd.read_csv(proc_test_samples_file)
    else:
        print "Reading test data..."
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print "Porcessing test samples"
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
    test_feature = new_test_samples.values

    print "Loading the Random Forest Classifier"
    rf_classifier = data_io.load_model(model_name="rf_classifier.pkl")
    print "Random Forest Predicting"
    rf_predictions = rf_classifier.predict_proba(test_feature)[:, 1]

    print "Loading the Gradient Boosting Classifier"
    gb_classifier = data_io.load_model(model_name="gb_classifier.pkl")
    print "Gradient Boosting Predicting"
    gb_predictions = gb_classifier.predict_proba(test_feature)[:, 1]

    print "Loading the SGD Classifier"
    sgd_classifier = data_io.load_model(model_name="sgd_classifier.pkl")
    print "SGD Predicting"
    sgd_predictions = sgd_classifier.predict_proba(test_feature)[:, 1]

    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    mean_score = np.mean(prob_arr, axis=0)
    mean_score = -1.0 * mean_score

    mean_recommendations = zip(new_test_samples["srch_id"], new_test_samples["prod_id"], mean_score)

    print "Writing predictions to file"
    data_io.write_submission(mean_recommendations, submission_file="mean_result_%i.csv" % n_train_samples)
 def parse_paperauthors(self):
     # Update all journal/conference/coauthor information
     print "Parsing PaperAuthors..."
     f = open(data_io.get_paths()["paperauthor_processed_path"], "r")
     titles = f.readline()
     count = 0
     for l in f:
         count += 1
         if count % 100000 == 0:
           print count
         res = l.strip().split(",")
         if not res[0].isdigit():
           continue
         paper_id = int(res[0])
         author_id = int(res[1])
         raw_author_name = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
         author_name = nlp.filter_title(raw_author_name)[0]
         raw_author_affiliation = unidecode.unidecode(unicode(res[3], encoding="utf-8"))
         author_affiliation = nlp.filter_affiliation(raw_author_affiliation)
         curr_paper = self.papers.get(paper_id)
         curr_author = self.authors.get(author_id)
         self.update_paperauthor(curr_paper, curr_author, author_id, author_name, author_affiliation)
     print "Done"
     f.close()
def classify_catagory(train, test):
    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(train, test, random_state = 1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    classifier = GradientBoostingClassifier(n_estimators=1024,
                                          random_state = 1,
                                          subsample = .8,
                                          min_samples_split=10,
                                          max_depth = 6,
                                          verbose=3)
    classifier.fit(trainX, trainY)
    print "Score = ", classifier.score(testX, testY)

    feature_importrance = classifier.feature_importances_
    logger = open(data_io.get_paths()["feature_importance_path"], "a")
    for fi in feature_importrance:
      logger.write(str(fi))
      logger.write("\n")
    logger.write("###########################################\n")
    logger.close()

    return classifier
def main(overwrite=False):

    #### TODO - sequential processing of data would significantly reduce memory demands
    
    if (not overwrite) and os.path.exists(os.path.join(data_io.get_paths()["real_feature_path"], 'injectivity.csv')):
        print 'Feature file already exists - not overwriting'
        return

    features = [('Injectivity 10', ['A','B'], f.injectivity_10),
                ('Injectivity 15', ['A','B'], f.injectivity_15),
                ('Injectivity 20', ['A','B'], f.injectivity_20),
                ('Injectivity 25', ['A','B'], f.injectivity_25),
                ('Injectivity 30', ['A','B'], f.injectivity_30),
                ('Injectivity 35', ['A','B'], f.injectivity_35),
                ('Injectivity 40', ['A','B'], f.injectivity_40)]
                
    feature_names = [name for (name, dummy1, dummy2) in features]
    
    print("Reading in the training data")
    train = data_io.read_train_pairs()

    print("Extracting features from training data")
    train_features = f.apply_features(train, features)
    
    print("Reading in the validation data")
    valid = data_io.read_valid_pairs()

    print("Extracting features from validation data")
    valid_features = f.apply_features(valid, features)
    
    # Concatenate features
    all_features = train_features
    all_features.update(valid_features)
    
    print("Writing feature file")
    data_io.write_real_features('injectivity', all_features, feature_names)
Example #30
def main():
    data = pickle.load(open(data_io.get_paths()["valid_features"], 'rb'))
    predict_write(data, "valid")
Example #31
import random
import sys
from data_io import get_paths
from os.path import join as path_join
import os

try:
    P_train = float(sys.argv[5])
    P_validation = float(sys.argv[6])
except IndexError:
    P_train = 0.6
    P_validation = 0.2

print "P train = %s %%" % (P_train * 100)
print "P validation = %s %%" % (P_validation * 100)
print "P test = %s %%" % ((1 - P_validation - P_train) * 100)
paths = get_paths("Settings_submission.json")

input_file = sys.argv[1]
output_file1 = path_join(paths["data_path"], "data/processed", sys.argv[2])
output_file2 = path_join(paths["data_path"], "data/processed", sys.argv[3])
output_file3 = path_join(paths["data_path"], "data/processed", sys.argv[4])

print "Input: %s " % input_file
print "Train file: %s " % output_file1
print "Validation file: %s " % output_file2
print "Test file: %s " % output_file3

run = raw_input("OK (Y/N)?")
print run
if run != "Y":
    sys.exit()
Example #32
def parse_conference(file):
    parse(file, data_io.get_paths()["conference_processed_path"])
Example #33
    print "Loading the Random Forest Classifier"
    rf_classifier = data_io.load_model(model_name="rf_classifier.pkl")
    print "Random Forest Predicting"
    rf_predictions = rf_classifier.predict_proba(test_feature)[:, 1]

    print "Loading the Gradient Boosting Classifier"
    gb_classifier = data_io.load_model(model_name="gb_classifier.pkl")
    print "Gradient Boosting Predicting"
    gb_predictions = gb_classifier.predict_proba(test_feature)[:, 1]

    print "Loading the SGD Classifier"
    sgd_classifier = data_io.load_model(model_name="sgd_classifier.pkl")
    print "SGD Predicting"
    sgd_predictions = sgd_classifier.predict_proba(test_feature)[:, 1]

    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
    mean_score = np.mean(prob_arr, axis=0)
    mean_score = -1.0 * mean_score

    mean_recommendations = zip(new_test_samples["srch_id"], new_test_samples["prod_id"], mean_score)

    print "Writing predictions to file"
    data_io.write_submission(mean_recommendations, submission_file="mean_result_%i.csv" % n_train_samples)


if __name__ == "__main__":
    n_train_samples = 8930723
    save_train_sample_file = "proc_train_sample_%i.csv" % n_train_samples
    processed_train_csv_file = os.path.join(get_paths()["proc_train_path"], save_train_sample_file)
    training(processed_train_csv_file)
    prediction(n_train_samples)
def parse_journal(file):
  parse(file, data_io.get_paths()["journal_processed_path"])
Example #35
def parse_journal(file):
    parse(file, data_io.get_paths()["journal_processed_path"])
Example #36
def do_prediction(n_trian_samples):
    proc_test_samples_file = get_paths()['proc_test_samples_path']

    if os.path.exists(proc_test_samples_file):
        print "Loading processed test data..."
        new_test_samples = pd.read_csv(proc_test_samples_file)
        print "Loading processed test data done"
    else:
        #  prediction
        print "reading test data..."
        test_samples = data_io.read_test()
        test_samples = test_samples.fillna(value=0)
        print "done."

        # process test samples
        print "processing test data..."
        new_test_samples = process_test_samples(test_samples)
        new_test_samples.to_csv(proc_test_samples_file, index=None)
        print "Processing test data done."

    test_features = new_test_samples.values

    # 5.1 random forest prediction
    print("Loading the random forest classifier")
    rf_classifier = data_io.load_model(model_name='rf_classifier.pkl')
    print("random forest Predicting")
    # take the positive-class probabilities
    rf_predictions = rf_classifier.predict_proba(test_features)[:, 1]

    # 5.2 Gradient Boosting prediction
    print("Loading the Gradient Boosting  classifier")
    gb_classifier = data_io.load_model(model_name='gb_classifier.pkl')
    print("Gradient Boosting  Predicting")
    gb_predictions = gb_classifier.predict_proba(test_features)[:, 1]

    # 5.3 SGD prediction
    print("Loading the SGD classifier")
    sgd_classifier = data_io.load_model(model_name='sgd_classifier.pkl')
    print("SGD Predicting")
    sgd_predictions = sgd_classifier.predict_proba(test_features)[:, 1]

    # 5.4 LR prediction
    # print("Loading the LR classifier")
    # lr_classifier = data_io.load_model(model_name='lr_classifier.pkl')
    # print("Logistic Regression Predicting")
    # lr_predictions = lr_classifier.predict_proba(test_features)[:1]

    # step 6: score fusion - stack the three probability vectors
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))

    # arithmetic mean of the three probabilities
    mean_score = np.mean(prob_arr, axis=0)
    # The submission is sorted in ascending order, so multiply by -1
    # to rank the highest probabilities first.
    mean_score = -1.0 * mean_score
    # geometric mean (said not to work as well as the arithmetic mean)
    gmean = stats.gmean(prob_arr, axis=0)
    # for sorting
    gmean = -1.0 * gmean

    # step 7 output result
    mean_recommendations = zip(new_test_samples['srch_id'],
                               new_test_samples['prop_id'], mean_score)
    gmean_recommendations = zip(new_test_samples['srch_id'],
                                new_test_samples['prop_id'], gmean)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations, submission_file='mean_result_%i.csv' % n_trian_samples)
    data_io.write_submission(gmean_recommendations, submission_file='gmean_result_%i.csv' % n_trian_samples)
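
The score-fusion step above reduces to a few lines of NumPy/SciPy. The toy numbers below are made up and only illustrate the arithmetic: the scores are negated because, as the original comments note, the submission writer sorts ascending, so negating puts the most probable bookings first.

import numpy as np
from scipy import stats

# Made-up positive-class probabilities from three classifiers for four rows.
rf_predictions = np.array([0.9, 0.2, 0.6, 0.4])
gb_predictions = np.array([0.8, 0.3, 0.5, 0.5])
sgd_predictions = np.array([0.7, 0.1, 0.7, 0.3])

prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))
mean_score = -1.0 * np.mean(prob_arr, axis=0)       # arithmetic fusion, negated for the ascending sort
gmean_score = -1.0 * stats.gmean(prob_arr, axis=0)  # geometric-mean variant
print(mean_score)   # [-0.8 -0.2 -0.6 -0.4]
print(gmean_score)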
Example #37
from os.path import join as path_join
import re

#import string
import logging
from nltk.tokenize.regexp import WordPunctTokenizer
from nltk.corpus import stopwords
from itertools import izip, repeat
import operator
import joblib
from collections import Counter

from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.mmcorpus import MmCorpus
paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")
tmp_dir = path_join(data_dir, "tmp")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('test_miislita')

regex = re.compile("^(.+;\s*)\n", re.MULTILINE)
regex_num = re.compile("\W(\d+)\W")
regex_hex = re.compile("\W([a-f0-9-]+)\W")
regex_punct = re.compile("^([:/\]\[\)\(,\.\}\{\?\!#=@'0-9]+)$")
english_stopwords = stopwords.words('english')
english_stopwords = dict(izip(english_stopwords, repeat(True, len(english_stopwords))))
#tokenizer = WordPunctTokenizer()
  #validate(f, data_io.get_paths()["valid_sol_path"], classifier)
  print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])

  return classifier

def predict(f, classifier, file_path):
  file_pt = open(file_path, "r")
  title = file_pt.readline()
  output = open(data_io.get_paths()["submission_path"], "a")
  tot_fet = None
  for l in file_pt.readlines():
    res = l.split(",")
    fet = f.create_features_from_res(res)
    if tot_fet == None:
      tot_fet = fet
    else:
      tot_fet = numpy.vstack((tot_fet, fet))
    pred = classifier.predict_proba(fet[:, 3:])
    sorted_pred = sorted(zip(res[1].split(), pred[:, 1]), key=lambda a:a[1], reverse=True)
    #print sorted_pred
    output.write(res[0] + "," + " ".join(map(lambda a: a[0], sorted_pred)) + "\n")
  output.close()
  numpy.savetxt(data_io.get_paths()["test_feature_path"], tot_fet.astype(float), fmt='%f', delimiter=",")

if __name__ == "__main__":
  p = parser.Parser()
  p.parse_csv()
  f = feature.Feature(p)
  classifier = train(f, data_io.get_paths()["train_path"])
  predict(f, classifier, data_io.get_paths()["valid_path"])
Example #39
def main():
    '''
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")
    '''

    features_deleted = pickle.load(
        open(data_io.get_paths()["deleted_features"], 'rb'))
    features_conf = pickle.load(
        open(data_io.get_paths()["confirmed_features"], 'rb'))

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    features = np.array(features)
    target = np.array(target)
    '''
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)
    '''

    #Referred https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/ for parameter tuning

    param_test1 = {'max_depth': [19], 'min_child_weight': [1]}

    param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}

    param_test3 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    '''
    gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=19,
                                                    min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
                                                    objective='binary:logistic', scale_pos_weight=1,
                                                    seed=27), param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(features, target)
    print(gsearch1.grid_scores_)
    print(gsearch1.best_params_)
    print(gsearch1.best_score_)
    exit()
    '''
    '''
    classifier = xgb.XGBClassifier(learning_rate=0.03, n_estimators=300, max_depth=19,
                                                    min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
                                                    objective='binary:logistic', seed=27).fit(features, target)
    '''
    '''
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1).fit(features, target)
    '''
    '''
    print(len(features))
    a = np.random.permutation(len(features))[0:10000]
    features = features[a]
    target = target[a]
    classifier = svm.SVC(probability=True).fit(features, target)
    '''

    #classifier = GaussianNB().fit(features, target)

    classifier = xgb.XGBClassifier(max_depth=5,
                                   n_estimators=300,
                                   learning_rate=0.05,
                                   objective="binary:logistic").fit(
                                       features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)

    # accuracy 0.9729 for valid set
    #classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit(features, target)
    ''' accuracy 0.9723 for valid set
        # Create Papers
        print "Parsing Papers..."
        cursor.execute("SELECT * from Paper;")
        for res in cursor:
            self.papers[res[0]] = paper.Paper(res[0], res[1], res[2], res[3], res[4], res[5])
        print "Done"
                
        # First Update all journal/conference/coauthor information
        print "Parsing PaperAuthors..."
        cursor.execute("SELECT * from PaperAuthor;")
        for res in cursor:
            paper_id = res[0]
            author_id = res[1]
            curr_author = None
            curr_paper = None
            if paper_id in self.papers.keys():
                curr_paper = self.papers[paper_id]
            if author_id in self.authors.keys():
                curr_author = self.authors[author_id]
            self.update_paperauthor(curr_paper, curr_author, author_id)
        print "Done"
    
        
if __name__ == "__main__":
    p = Parser()
    p.parse_csv()
    with open(data_io.get_paths()["parser_path"], "wb") as output:
        pickle.dump(p, output)
        
Example #41
def process_train_samples(samples, max_srch_size=10, each_saved_size=1000000):
    '''
    Process the samples: feature extraction plus downsampling.
    NB: samples sharing a srch_id that contain a positive target are treated
    as one positive sample; otherwise they are negative samples.
    max_srch_size caps how many rows are kept per srch_id: if one srch_id in
    the train set has 20 rows, only 10 randomly chosen rows are kept, which
    is the downsampling step. Because the training set is large, a file is
    written out (and can be trained on) after every one million processed rows.
    '''

    # The training data is unordered, so sort it first so that rows with the
    # same srch_id end up next to each other.
    sorted_samples = samples.sort_values(by=['srch_id'])  # group by srch_id
    sorted_samples = sorted_samples.reset_index(drop=True)  # reset row index
    processed_samples = pd.DataFrame()

    samples_in_one_srch = pd.DataFrame()
    # The loop checks whether the next row still belongs to the same srch_id.
    for r_idx, sample in sorted_samples.iterrows():
        if (r_idx + 1) % 1000 == 0:
            print "processed %i sample of %i " % (r_idx + 1, sorted_samples.shape[0])

        is_next_in_same_search = True
        samples_in_one_srch = pd.concat((sample.to_frame().transpose(), samples_in_one_srch), axis=0)

        current_srch_id = sample['srch_id']

        # last row
        if (r_idx + 1) == sorted_samples.shape[0]:
            is_next_in_same_search = False
        else:
            next_srch_id = sorted_samples['srch_id'][r_idx + 1]
            if current_srch_id != next_srch_id:
                is_next_in_same_search = False

        # A complete srch_id group has been collected, so extract its features.
        # This part is slow: roughly 8 hours on a 16 GB machine.
        if not is_next_in_same_search:
            ## if next one is not in the same search process the samples in the same search

            # feature extraction for samples
            ext_samples_in_one_srch = extract_features(samples_in_one_srch)

            # downsampling: how many samples fall under this srch_id
            n_samples = ext_samples_in_one_srch.shape[0]

            # e.g. with max_srch_size = 10 and more than 10 rows in this group
            if n_samples > max_srch_size:
                # too many samples in one search, so downsample
                if np.any(ext_samples_in_one_srch['booking_bool']):
                    # a booking exists in this search (1 in booking_bool);
                    # the positive samples have to be kept
                    pos_samples = ext_samples_in_one_srch[ext_samples_in_one_srch['booking_bool'] == 1]
                    neg_samples = ext_samples_in_one_srch[ext_samples_in_one_srch['booking_bool'] == 0]
                    # Then draw the negatives at random, e.g. with 28 rows,
                    # max_srch_size = 10 and 1 positive, 9 negatives are sampled.
                    selected_neg_samples = neg_samples.sample(n=max_srch_size - pos_samples.shape[0])
                    selected_samples = pd.concat((pos_samples, selected_neg_samples), axis=0)
                else:
                    # No positive sample in this search, so simply pick
                    # max_srch_size rows at random.
                    selected_samples = ext_samples_in_one_srch.sample(n=max_srch_size)
            else:
                # few enough samples: keep the whole group
                selected_samples = ext_samples_in_one_srch.copy()

            processed_samples = pd.concat((processed_samples, selected_samples), axis=0)

            # create new samples for the next search
            samples_in_one_srch = pd.DataFrame()

        # Persist the processed samples every one million rows.
        if (r_idx + 1) % each_saved_size == 0:
            # save samples for every each_saved_size
            save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
            save_path = get_paths()['proc_train_path']
            if not os.path.exists(save_path):
                os.mkdir(save_path)

            if np.any(np.isnan(processed_samples.values)):
                # remove nan
                processed_samples = processed_samples.fillna(value=0)
                print "remove nan."
            processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)

    # out of loop save all processed samples
    save_file_name = 'proc_train_samples_%i.csv' % (r_idx + 1)
    save_path = get_paths()['proc_train_path']
    if np.any(np.isnan(processed_samples.values)):
        # remove nan
        processed_samples = processed_samples.fillna(value=0)
        print "remove nan."
    processed_samples.to_csv(os.path.join(save_path, save_file_name), index=None)
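
The row-by-row loop above is how the original code groups searches; the same keep-the-positives-then-sample-negatives rule can also be sketched with a pandas groupby. The toy frame below only illustrates the downsampling policy (column names srch_id / booking_bool as above) and deliberately skips the feature extraction and incremental saving, so it is not a drop-in replacement for process_train_samples.

import pandas as pd

def downsample_one_search(group, max_srch_size=10):
    # Keep every booked row, then top up with randomly chosen negatives.
    if len(group) <= max_srch_size:
        return group
    pos = group[group["booking_bool"] == 1]
    neg = group[group["booking_bool"] == 0]
    n_neg = max(max_srch_size - len(pos), 0)
    return pd.concat((pos, neg.sample(n=min(n_neg, len(neg)))), axis=0)

toy = pd.DataFrame({
    "srch_id": [1] * 15 + [2] * 5,
    "prop_id": list(range(20)),
    "booking_bool": [1] + [0] * 19,
})
capped = toy.groupby("srch_id", group_keys=False).apply(downsample_one_search)
print(capped["srch_id"].value_counts())  # search 1 capped at 10 rows, search 2 left intact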
Example #42
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import joblib


def log_mean_absolute_error(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")


names = ["Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3", "Loc4", "Loc5", "Company", "SourceName"]
le_features = map(lambda x: label_encode_column_fit_only(
    x, file_id="train_full_data_path", type_n="train_full"), names)

features = map(lambda (le, name): label_encode_column_transform(le, name, file_id="train_data_path", type_n="train"), zip(le_features, names))

description_length = map(len, read_column(paths["train_data_path"], "FullDescription"))
title_length = map(len, read_column(paths["train_data_path"], "Title"))

features.append(description_length)
Example #43
    title = file_pt.readline()
    output = open(data_io.get_paths()["submission_path"], "a")
    tot_fet = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if tot_fet == None:
            tot_fet = fet
        else:
            tot_fet = numpy.vstack((tot_fet, fet))
        pred = classifier.predict_proba(fet[:, 3:])
        sorted_pred = sorted(zip(res[1].split(), pred[:, 1]),
                             key=lambda a: a[1],
                             reverse=True)
        #print sorted_pred
        output.write(res[0] + "," +
                     " ".join(map(lambda a: a[0], sorted_pred)) + "\n")
    output.close()
    numpy.savetxt(data_io.get_paths()["test_feature_path"],
                  tot_fet.astype(float),
                  fmt='%f',
                  delimiter=",")


if __name__ == "__main__":
    p = parser.Parser()
    p.parse_csv()
    f = feature.Feature(p)
    classifier = train(f, data_io.get_paths()["train_path"])
    predict(f, classifier, data_io.get_paths()["valid_path"])
def parse_paperauthor(file):
  parse(file, data_io.get_paths()["paperauthor_processed_path"])
                     label_encode_column_fit, label_encode_column_transform,
                     load_predictions, fit_predict, write_submission)
from os.path import join as path_join
#import joblib
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
import joblib


def log_mean_absolute_error(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))


paths = get_paths("Settings_submission.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

le_category, category_train = label_encode_column_fit("Category")
category_valid = label_encode_column_transform(le_category, "Category")

le_contractTime, contractTime_train = label_encode_column_fit("ContractTime")
contractTime_valid = label_encode_column_transform(le_contractTime,
                                                   "ContractTime")

le_contractType, contractType_train = label_encode_column_fit("ContractType")
contractType_valid = label_encode_column_transform(le_contractType,
                                                   "ContractType")
features = join_features(
import joblib
from data_io import (write_submission, get_paths)
import numpy as np
from os.path import join as path_join

paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

model_name = "vowpal_submission"
type_n = "submission"
predictions = joblib.load(
    path_join(prediction_dir, model_name + "_prediction_" + type_n))
model_name = "vowpal_submission_round"
predictions = np.exp(predictions)
predictions = predictions / 1000
#print predictions[1:10]
predictions = np.round(predictions) * 1000
joblib.dump(predictions,
            path_join(prediction_dir, model_name + "_prediction_" + type_n))
write_submission("vowpal_fastml_round.csv",
                 "vowpal_submission_round_prediction_submission",
                 unlog=False)
Example #47
    # step 6: score fusion - stack the three probability vectors
    prob_arr = np.vstack((rf_predictions, gb_predictions, sgd_predictions))

    # arithmetic mean of the three probabilities
    mean_score = np.mean(prob_arr, axis=0)
    # The submission is sorted in ascending order, so multiply by -1
    # to rank the highest probabilities first.
    mean_score = -1.0 * mean_score
    # geometric mean (said not to work as well as the arithmetic mean)
    gmean = stats.gmean(prob_arr, axis=0)
    # for sorting
    gmean = -1.0 * gmean

    # step 7 output result
    mean_recommendations = zip(new_test_samples['srch_id'],
                               new_test_samples['prop_id'], mean_score)
    gmean_recommendations = zip(new_test_samples['srch_id'],
                                new_test_samples['prop_id'], gmean)

    print("Writing predictions to file")
    data_io.write_submission(mean_recommendations, submission_file='mean_result_%i.csv' % n_trian_samples)
    data_io.write_submission(gmean_recommendations, submission_file='gmean_result_%i.csv' % n_trian_samples)


if __name__ == "__main__":
    n_train_samples = 8930723
    saved_train_sample_file = 'proc_train_samples_%i.csv' % n_train_samples
    processed_train_csv_file = os.path.join(get_paths()['proc_train_path'], saved_train_sample_file)
    do_training(processed_train_csv_file)
    do_prediction(n_train_samples)
Example #48
        # Create Papers
        print "Parsing Papers..."
        cursor.execute("SELECT * from Paper;")
        for res in cursor:
            self.papers[res[0]] = paper.Paper(res[0], res[1], res[2], res[3],
                                              res[4], res[5])
        print "Done"

        # First Update all journal/conference/coauthor information
        print "Parsing PaperAuthors..."
        cursor.execute("SELECT * from PaperAuthor;")
        for res in cursor:
            paper_id = res[0]
            author_id = res[1]
            curr_author = None
            curr_paper = None
            if paper_id in self.papers.keys():
                curr_paper = self.papers[paper_id]
            if author_id in self.authors.keys():
                curr_author = self.authors[author_id]
            self.update_paperauthor(curr_paper, curr_author, author_id)
        print "Done"


if __name__ == "__main__":
    p = Parser()
    p.parse_csv()
    with open(data_io.get_paths()["parser_path"], "wb") as output:
        pickle.dump(p, output)
from scipy.optimize import curve_fit

# _________________________________________________________________________________________________________

# test_person = 'me'
# test_person = 'me02'
test_person = 'gen'
# test_person = 'marie01'
# test_person = 'marie02'

footage_file_name = test_person + '.0001.mov'
render_file_name = test_person + '_tracked_calib.0001.avi'
render_l_eye_file_name = test_person + '_eye_l_tracked.0001.avi'
render_r_eye_file_name = test_person + '_eye_r_tracked.0001.avi'

footage_folder, render_folder, config_path, data_path = data_io.get_paths(
    test_person)

print('footage folder:' + footage_folder)
print('render folder:' + render_folder)

predictor_file_path = config_path + 'shape_predictor_68_face_landmarks.dat'

lmarks_left_eye = [42, 43, 44, 45, 46, 47]  # starting from inner corner -> up
lmarks_right_eye = [36, 37, 38, 39, 40, 41]  # starting from outer corner -> up

curr_lmpoints_r_eye_x = []
curr_lmpoints_r_eye_y = []
curr_lmpoints_l_eye_x = []
curr_lmpoints_l_eye_y = []

roi_eye_offset_x = 260
def parse_conference(file):
  parse(file, data_io.get_paths()["conference_processed_path"])
import joblib
from data_io import (
    write_submission,
    get_paths
)
import numpy as np
from os.path import join as path_join
paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

model_name = "vowpal_submission"
type_n = "submission"
predictions = joblib.load(path_join(prediction_dir, model_name + "_prediction_" + type_n))
model_name = "vowpal_submission_round"
predictions = np.exp(predictions)
predictions = predictions / 1000
#print predictions[1:10]
predictions = np.round(predictions) * 1000
joblib.dump(predictions, path_join(prediction_dir, model_name + "_prediction_" + type_n))
write_submission("vowpal_fastml_round.csv", "vowpal_submission_round_prediction_submission", unlog=False)
Example #52
def parse_paperauthor(file):
    parse(file, data_io.get_paths()["paperauthor_processed_path"])