Ejemplo n.º 1
0
def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
    """Augment a judgment file with altered-keyword copies of its queries.

    Reads all judgments from `judgmentInFile`, then makes `rounds` passes
    over the queries. On each pass every query's keywords are run through
    `butterfingers` (presumably a typo generator -- confirm against its
    definition). When the result differs from the original keywords and has
    not been produced before, every judgment of that query is copied under a
    fresh qid with the altered keywords. The original judgments followed by
    all copies are written to `judgmentOutFile`.

    Args:
        judgmentInFile: Path of the judgment file to read.
        judgmentOutFile: Path the augmented judgment list is written to.
        rounds: Number of passes over the queries (default 100).
    """
    with open(judgmentInFile) as f:
        currJudgments = list(judgments_from_file(f))

    # Number new queries after the largest qid in use rather than after the
    # qid of the last line: this cannot reuse an existing qid when the input
    # is not sorted by qid, and an empty input no longer raises IndexError.
    if currJudgments:
        lastQid = max(judg.qid for judg in currJudgments)
    else:
        lastQid = 0
    judgDict = judgments_by_qid(currJudgments)

    # Altered keyword strings already emitted, so each gets a single qid.
    existingTypos = set()

    for _ in range(rounds):
        for _qid, judglist in judgDict.items():
            # The keywords are read from the first judgment of the query.
            keywords = judglist[0].keywords
            keywordsWTypo = butterfingers(keywords)

            # Skip unchanged keywords and variants that were already added.
            if keywordsWTypo == keywords or keywordsWTypo in existingTypos:
                continue

            print("%s => %s" % (keywords, keywordsWTypo))
            lastQid += 1
            currJudgments.extend([
                Judgment(grade=judg.grade,
                         qid=lastQid,
                         keywords=keywordsWTypo,
                         doc_id=judg.doc_id)
                for judg in judglist
            ])
            existingTypos.add(keywordsWTypo)

    with open(judgmentOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=currJudgments)
Ejemplo n.º 2
0
def train():
    """Log features for the judgment list and train/upload each model type.

    Steps, in order: connect to Elasticsearch, initialise the default
    feature store and load the feature set, parse the judgment file and
    group it by query id, log feature values for those judgments, write the
    judgments-with-features training file, then train one model per ranker
    id and save each one under the name "gsearch_model_<id>".
    """
    from judgments import judgments_from_file, judgments_by_qid

    es = elastic_connection(timeout=1000)

    # Load features into Elasticsearch.
    init_default_store()
    load_features(FEATURE_SET_NAME)

    # Parse the judgment file and group the judgments by query id.
    parsed_judgments = judgments_from_file(filename=JUDGMENTS_FILE)
    movieJudgments = judgments_by_qid(parsed_judgments)

    # Use the proposed Elasticsearch queries (1.json.jinja ... N.json.jinja)
    # to generate a training set, written out as the features judgment file.
    log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME)
    build_features_judgments_file(movieJudgments,
                                  filename=JUDGMENTS_FILE_FEATURES)

    # Every model is trained into the same scratch file and then uploaded.
    model_path = 'model.txt'

    # Ranker ids handed to train_model:
    #   0 MART, 1 RankNet, 2 RankBoost, 3 AdaRank, 4 coord Ascent,
    #   6 LambdaMART, 7 ListNET, 8 Random Forests, 9 Linear Regression
    # NOTE(review): id 5 is trained as well but was never described here.
    for ranker_id in range(10):
        Logger.logger.info("*** Training %s " % ranker_id)
        train_model(judgments_with_features_file=JUDGMENTS_FILE_FEATURES,
                    model_output=model_path,
                    which_model=ranker_id)
        save_model(script_name="gsearch_model_%s" % ranker_id,
                   feature_set=FEATURE_SET_NAME,
                   model_fname=model_path)
Ejemplo n.º 3
0
                             auth=ES_AUTH,
                             verify=False)
        Logger.logger.info(resp.status_code)
        if resp.status_code >= 300:
            Logger.logger.error(resp.text)


# Script entry point: load features, log them for the judgment list and write
# the training file.
# NOTE(review): the body of the `for modelType` loop below reads `res` and
# `judgments`, neither of which is assigned anywhere in the visible code. It
# looks like the tail of a different function was spliced in here -- confirm
# against the original sources before running.
if __name__ == "__main__":
    from judgments import judgments_from_file, judgments_by_qid

    es = elastic_connection(timeout=1000)
    # Load features into Elasticsearch
    init_default_store()
    load_features(FEATURE_SET_NAME)
    # Parse the judgment file and group the judgments by query id
    movieJudgments = judgments_by_qid(
        judgments_from_file(filename=JUDGMENTS_FILE))
    # Use proposed Elasticsearch queries (1.json.jinja ... N.json.jinja) to generate a training set
    # output as "sample_judgments_wfeatures.txt"
    log_features(es, judgments_dict=movieJudgments, search_index=INDEX_NAME)
    build_features_judgments_file(movieJudgments,
                                  filename=JUDGMENTS_FILE_FEATURES)
    # Train each ranklib model type
    # (the full list of ids is kept below for reference; only 6 is active)
    #for modelType in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    for modelType in [6]:
        # 0, MART
        # 1, RankNet
        # 2, RankBoost
        # 3, AdaRank
        # 4, coord Ascent
        # 6, LambdaMART
        # 7, ListNET
        # Add feature back to each judgment
        # Build a doc id -> feature list lookup from the hits in `res`.
        features_per_doc = {}
        for doc in res['hits']['hits']:
            docId = doc['_id']
            features = doc['fields']['_ltrlog'][0]['main']
            features_per_doc[docId] = feature_dict_to_list(features)

        # Append features from ES back to ranklib judgment list
        for judgment in judgments:
            try:
                features = features_per_doc[
                    judgment.docId]  # If KeyError, then we have a judgment but no file in index
                judgment.features = features
            except Exception as e:
                # Any failure is printed and logged as a missing id; the
                # judgment is then left without a `features` assignment.
                print(e)
                Logger.logger.info("Missing id %s" % judgment.docId)


def build_features_judgments_file(judgments_with_features, filename):
    """Write every judgment to `filename`, one ranklib-formatted line each.

    Args:
        judgments_with_features: Mapping of query id to a list of judgment
            objects; each object must provide `to_ranklib_format()`.
        filename: Path of the output file; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(filename, 'w') as out:
        # Queries are emitted in the mapping's iteration order, and the
        # judgments of each query in list order.
        for _qid, per_query in judgments_with_features.items():
            out.writelines(
                entry.to_ranklib_format() + "\n" for entry in per_query
            )


if __name__ == "__main__":
    # Script entry point: connect, parse and group the judgment file, log
    # features for those judgments, then write the training file.
    es_connection = elastic_connection()

    parsed_judgments = judgments_from_file(JUDGMENTS_FILE)
    judgmentsByQid = judgments_by_qid(parsed_judgments)

    log_features(es_connection, judgmentsByQid, INDEX_NAME)
    build_features_judgments_file(judgmentsByQid, JUDGMENTS_FILE_FEATURES)