コード例 #1
0
def extract_distance_features(df, output_path="../../data/feat/test_distFeat.csv"):
    """Build n-gram distance features for ``df`` and save the result as CSV.

    Unigram/bigram/trigram columns are created for the query, title,
    description and attribute values. Every pair of those fields is then
    compared with ``compute_dist`` for each distance name, the temporary
    n-gram columns are dropped, and the frame is written to ``output_path``.

    Args:
        df: DataFrame with "query", "title", "description" and "values"
            columns. The n-gram and distance columns are added to it in place.
        output_path: CSV destination; defaults to the previously hard-coded path.

    Returns:
        A new DataFrame holding the distance features without the temporary
        n-gram columns.
    """
    join_str = "_"
    grams = ["unigram", "bigram", "trigram"]
    text_fields = ["query", "title", "description"]
    feat_names = text_fields + ["attribute_values"]

    # NOTE(review): "query" and "values" are preprocessed with the extra
    # "stem" argument while title/description are not -- confirm intended.
    df["query_unigram"] = list(df.apply(lambda x: preprocess_data(x["query"], "stem"), axis=1))
    df["title_unigram"] = list(df.apply(lambda x: preprocess_data(x["title"]), axis=1))
    df["description_unigram"] = list(df.apply(lambda x: preprocess_data(x["description"]), axis=1))
    for gram, build in (("bigram", ngram.getBigram), ("trigram", ngram.getTrigram)):
        for name in text_fields:
            tokens_col = name + "_unigram"
            df[name + "_" + gram] = list(df.apply(lambda x: build(x[tokens_col], join_str), axis=1))

    df["attribute_values_unigram"] = list(df.apply(lambda x: preprocess_data(x["values"], "stem"), axis=1))
    # join_str is passed here as for every other field: the n-grams of
    # different fields are compared with each other below, so they must be
    # joined with the same separator.
    df["attribute_values_bigram"] = list(
        df.apply(lambda x: ngram.getBigram(x["attribute_values_unigram"], join_str), axis=1))
    df["attribute_values_trigram"] = list(
        df.apply(lambda x: ngram.getTrigram(x["attribute_values_unigram"], join_str), axis=1))

    # calculate distance
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    for dist in dists:
        # Two spaces reproduce the output of the old `print "Generating ", dist`.
        print("Generating  %s" % dist)
        for gram in grams:
            # Each unordered pair of fields exactly once.
            for i, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[i + 1:]:
                    target_col = target_name + "_" + gram
                    obs_col = obs_name + "_" + gram
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = list(
                        df.apply(lambda x: compute_dist(x[target_col], x[obs_col], dist), axis=1))

    print("Dropping columns")
    ngram_columns = ["%s_%s" % (name, gram) for gram in grams for name in feat_names]
    df = df.drop(ngram_columns, axis=1)
    print("Creating csv")
    df.to_csv(output_path, header=True, index=False)
    return df
コード例 #2
0
def extract_basic_distance_feat(df):
    """Add n-gram columns and pairwise n-gram distance features to ``df``.

    Reads the "query", "product_title" and "product_description" columns and
    writes all new columns into ``df`` in place; nothing is returned.
    ``print`` is called with a single argument so the function runs
    unchanged on Python 2 and Python 3.

    Args:
        df: DataFrame holding the three text columns named above.
    """
    join_str = "_"

    ## unigram
    print("generate unigram")
    df["query_unigram"] = list(df.apply(lambda x: preprocess_data(x["query"]), axis=1))
    df["title_unigram"] = list(df.apply(lambda x: preprocess_data(x["product_title"]), axis=1))
    df["description_unigram"] = list(df.apply(lambda x: preprocess_data(x["product_description"]), axis=1))
    ## bigram
    print("generate bigram")
    df["query_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["query_unigram"], join_str), axis=1))
    df["title_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["title_unigram"], join_str), axis=1))
    df["description_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["description_unigram"], join_str), axis=1))
    ## trigram
    print("generate trigram")
    df["query_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["query_unigram"], join_str), axis=1))
    df["title_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["title_unigram"], join_str), axis=1))
    df["description_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["description_unigram"], join_str), axis=1))

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["query", "title", "description"]
    for dist in dists:
        for gram in grams:
            # Each unordered pair of fields exactly once.
            for i, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[i + 1:]:
                    target_col = target_name + "_" + gram
                    obs_col = obs_name + "_" + gram
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = list(
                        df.apply(lambda x: compute_dist(x[target_col], x[obs_col], dist), axis=1))
コード例 #3
0
def extract_basic_distance_feat(df):
    """Attach n-gram columns and pairwise distance features to ``df`` in place.

    For the query, product title and product description this creates
    ``<field>_unigram``, ``<field>_bigram`` and ``<field>_trigram`` columns,
    then one ``<dist>_of_<gram>_between_<a>_<b>`` column per distance name,
    n-gram level and unordered pair of fields. Nothing is returned.
    """
    sep = "_"
    fields = ["query", "title", "description"]
    raw_column = {
        "query": "query",
        "title": "product_title",
        "description": "product_description",
    }

    print("generate unigram")
    for field in fields:
        source = raw_column[field]
        df[field + "_unigram"] = list(
            df.apply(lambda row: preprocess_data(row[source]), axis=1))

    print("generate bigram")
    for field in fields:
        tokens_col = field + "_unigram"
        df[field + "_bigram"] = list(
            df.apply(lambda row: ngram.getBigram(row[tokens_col], sep), axis=1))

    print("generate trigram")
    for field in fields:
        tokens_col = field + "_unigram"
        df[field + "_trigram"] = list(
            df.apply(lambda row: ngram.getTrigram(row[tokens_col], sep), axis=1))

    print("generate jaccard coef and dice dist for n-gram")
    for metric in ["jaccard_coef", "dice_dist"]:
        for level in ["unigram", "bigram", "trigram"]:
            # Walk every unordered pair (left, right) with left before right.
            for idx, left in enumerate(fields[:-1]):
                for right in fields[idx + 1:]:
                    left_col = left + "_" + level
                    right_col = right + "_" + level
                    feature = "%s_of_%s_between_%s_%s" % (metric, level, left, right)
                    df[feature] = list(
                        df.apply(lambda row: compute_dist(row[left_col], row[right_col], metric), axis=1))
コード例 #4
0
ファイル: gen_feat.py プロジェクト: mochiliu3000/Kaggle_Quora
def gen_ngram_data(df):
    """Add unigram, bigram and trigram columns for both questions.

    Args:
        df: DataFrame with "question1" and "question2" columns; the new
            ``q1_*`` / ``q2_*`` columns are added to it in place.

    Returns:
        The same DataFrame, for call chaining.
    """
    join_str = "_"
    ## unigram
    print("generate unigram")
    df["q1_unigram"] = list(df.apply(lambda x: preprocess_data(x["question1"]), axis=1))
    df["q2_unigram"] = list(df.apply(lambda x: preprocess_data(x["question2"]), axis=1))
    ## bigram
    print("generate bigram")
    df["q1_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["q1_unigram"], join_str), axis=1))
    df["q2_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["q2_unigram"], join_str), axis=1))
    ## trigram
    print("generate trigram")
    # Trigrams are built from the unigram tokens, like the bigrams. Feeding
    # the already-joined bigrams to getTrigram would join three bigrams
    # together rather than three words.
    df["q1_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["q1_unigram"], join_str), axis=1))
    df["q2_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["q2_unigram"], join_str), axis=1))
    return df
コード例 #5
0
def generate_ngrams(df):
    """Add unigram, bigram and trigram columns for product title and search term.

    The columns are written into ``df`` in place; nothing is returned.
    """
    sep = "_"

    def preprocessed(column):
        # Row-wise preprocess_data over one raw text column.
        return list(df.apply(lambda row: preprocess_data(row[column]), axis=1))

    def joined(builder, column):
        # Row-wise n-gram builder over one unigram column.
        return list(df.apply(lambda row: builder(row[column], sep), axis=1))

    print("generate unigram")
    df["title_unigram"] = preprocessed("product_title")
    df["search_term_unigram"] = preprocessed("search_term")

    print("generate bigram")
    df["title_bigram"] = joined(ngram.getBigram, "title_unigram")
    df["search_term_bigram"] = joined(ngram.getBigram, "search_term_unigram")

    print("generate trigram")
    df["search_term_trigram"] = joined(ngram.getTrigram, "search_term_unigram")
    df["title_trigram"] = joined(ngram.getTrigram, "title_unigram")
コード例 #6
0
def generate_brand_ngrams(df):
    """Add ``brand_unigram``, ``brand_bigram`` and ``brand_trigram`` to ``df`` in place."""
    sep = "_"

    print("Generate brand unigram")
    unigrams = df.apply(lambda row: preprocess_data(row["brand"]), axis=1)
    df["brand_unigram"] = list(unigrams)

    print("Generate brand bigram")
    bigrams = df.apply(lambda row: ngram.getBigram(row["brand_unigram"], sep), axis=1)
    df["brand_bigram"] = list(bigrams)

    print("Generate brand trigram")
    trigrams = df.apply(lambda row: ngram.getTrigram(row["brand_unigram"], sep), axis=1)
    df["brand_trigram"] = list(trigrams)
コード例 #7
0
def generate_product_ngrams(df):
    """Add unigram, bigram and trigram columns for the product description in place."""
    sep = "_"
    tokens_col = "description_unigram"

    print("Generate unigram")
    df[tokens_col] = list(
        df.apply(lambda row: preprocess_data(row["product_description"]), axis=1))

    # Both higher-order columns are derived from the unigram column.
    print("Generate bigram")
    df["description_bigram"] = list(
        df.apply(lambda row: ngram.getBigram(row[tokens_col], sep), axis=1))

    print("Generate trigram")
    df["description_trigram"] = list(
        df.apply(lambda row: ngram.getTrigram(row[tokens_col], sep), axis=1))
コード例 #8
0
def generateNGram(df):
    """Add unigram, bigram and trigram columns for query, title and description.

    Each n-gram column is computed directly from the raw text column
    ("query", "product_title", "product_description"), not from the unigram
    column. The columns are added in place and ``df`` is returned.
    """
    sources = [
        ('query', 'query'),
        ('title', 'product_title'),
        ('description', 'product_description'),
    ]

    # unigram
    for prefix, raw in sources:
        df[prefix + '_unigram'] = df[raw].apply(
            lambda text: ngram.getUnigram(text))
    # bigram
    for prefix, raw in sources:
        df[prefix + '_bigram'] = df[raw].apply(
            lambda text: ngram.getBigram(text, '_'))
    # trigram
    for prefix, raw in sources:
        df[prefix + '_trigram'] = df[raw].apply(
            lambda text: ngram.getTrigram(text, '_'))
    return df
コード例 #9
0
def extract_basic_distance_feat(df):
    """Add n-gram columns and n-gram distance features for a question pair.

    Reads the "question1" and "question2" columns and writes all new columns
    into ``df`` in place; nothing is returned.

    Runs on both Python 2 and Python 3: ``print`` is called with a single
    argument and every ``map`` result is materialized with ``list`` before
    it is stored in a column (on Python 3 ``map`` returns a lazy iterator).
    """
    print("generate ngrams")
    join_str = "_"
    feat_names = ["question1", "question2"]

    for feat_name in feat_names:
        print("generate ngrams for %s" % feat_name)
        df.loc[:, feat_name + "_unigram"] = list(map(preprocess_data, df[feat_name]))
        df.loc[:, feat_name + "_bigram"] = [
            ngram.getBigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]
        df.loc[:, feat_name + "_trigram"] = [
            ngram.getTrigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]

    ## jaccard coef/dice dist of n-gram
    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    for dist in dists:
        for gram in grams:
            # Each unordered pair of fields exactly once.
            for i, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[i + 1:]:
                    df["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = list(
                        map(partial(compute_dist, dist=dist),
                            df[target_name + "_" + gram],
                            df[obs_name + "_" + gram]))
コード例 #10
0
def str_common_word_ngram(str1, str2, n):
    """Count the word n-grams of ``str1`` that occur as substrings of ``str2``.

    ``str1`` is split on whitespace. For ``n == 1`` each word is looked up
    on its own; for ``n`` in 2..4 the words are combined into space-joined
    n-grams by the ``ngram`` module first. Matching is plain substring
    search, so a word also counts when it appears inside a longer word of
    ``str2``. Duplicates in ``str1`` are counted once per occurrence.

    Args:
        str1: Text whose n-grams are searched for.
        str2: Text that is searched.
        n: N-gram size, 1 to 4.

    Returns:
        The number of matching n-grams, or 0 (after printing a message) when
        ``n`` is not supported.
    """
    words = str1.split()
    if n == 1:
        # Look up each individual word, not the whole of str1.
        grams = words
    elif n == 2:
        grams = ngram.getBigram(words, " ")
    elif n == 3:
        grams = ngram.getTrigram(words, " ")
    elif n == 4:
        grams = ngram.getFourgram(words, " ")
    else:
        print("Incorrect n value entered:",n)
        return 0
    # NOTE(review): what the ngram helpers return when str1 has fewer than n
    # words is decided by that module -- confirm it yields no spurious grams.
    return sum(1 for gram in grams if gram in str2)
コード例 #11
0
ファイル: preprocessor.py プロジェクト: amsqr/hd
 def _get_ngram(self, sr):
     """
     Build unigram, bigram and trigram series from a pd.Series of text.

     Unigrams are the listed output of self._stem_excl_words for each entry \
     (presumably stemming plus stop-word removal -- confirm in that helper). \
     Bigrams and trigrams are derived from the unigram series and joined with '_'.

     Args:
         sr(pd.Series): text entries to process.

     Returns:
         sr_unigram(pd.Series), sr_bigram(pd.Series), sr_trigram(pd.Series)
     """
     def to_unigram(text):
         return list(self._stem_excl_words(text))

     def to_bigram(tokens):
         return ngram.getBigram(tokens, '_')

     def to_trigram(tokens):
         return ngram.getTrigram(tokens, '_')

     sr_unigram = sr.map(to_unigram)
     # Both higher-order series start from the unigram series.
     sr_bigram = sr_unigram.map(to_bigram)
     sr_trigram = sr_unigram.map(to_trigram)
     return sr_unigram, sr_bigram, sr_trigram
def test():
    """Print the 1-, 2- and 3-grams of the first two texts of every pickled column.

    Loads ``<column>_<category>.pickle`` from ./ModelSystem/ProcessedData for
    each column (query, title, description) and category (train, test).
    """
    data_path = "./ModelSystem/ProcessedData"
    column_names = ["query", "title", "description"]
    categories = ["train", "test"]

    for category in categories:
        for col in column_names:
            path = "%s/%s_%s.pickle" % (data_path, col, category)
            # NOTE: pickle.load can execute arbitrary code from the file;
            # only use it on trusted, locally generated pickles.
            with open(path, "rb") as f:
                texts = pickle.load(f)

            # Build unigrams, bigrams and trigrams for the first two entries.
            for i in range(2):
                # Replace everything except letters, digits and '.' by a space.
                text = re.sub("[^0-9a-zA-Z.]", " ", texts[i])
                word_list = text.split()

                unigram = word_list
                bigram = ngram.getBigram(word_list, "_")
                trigram = ngram.getTrigram(word_list, "_")

                print(unigram)
                print(bigram)
                print(trigram)

    print("ngram All Done.")
コード例 #13
0
def extract_feat(df):
    """Add n-gram counting, intersection and position features for a question pair.

    Reads the "question1" and "question2" columns and writes every new column
    into ``df`` in place; nothing is returned.

    Runs on both Python 2 and Python 3: ``print`` is called with a single
    argument and every ``map`` result is materialized with ``list`` before
    it is stored in a column (on Python 3 ``map`` returns a lazy iterator).
    """
    join_str = "_"
    feat_names = ["question1", "question2"]
    grams = ["unigram", "bigram", "trigram"]

    ## n-grams
    print("generate ngrams")
    for feat_name in feat_names:
        print("generate ngrams for %s" % feat_name)
        df.loc[:, feat_name + "_unigram"] = list(map(preprocess_data, df[feat_name]))
        df.loc[:, feat_name + "_bigram"] = [
            ngram.getBigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]
        df.loc[:, feat_name + "_trigram"] = [
            ngram.getTrigram(x, join_str) for x in df[feat_name + "_unigram"]
        ]

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # Number of tokens that consist only of digits.
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            gram_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[count_col] = [len(x) for x in df[gram_col]]
            df[unique_col] = [len(set(x)) for x in df[gram_col]]
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, df[unique_col], df[count_col]))

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = list(map(count_digit, df[feat_name + "_unigram"]))
        df["ratio_of_digit_in_%s" % feat_name] = list(
            map(try_divide, df[digit_col], df["count_of_%s_unigram" % feat_name]))

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")

    def word_count_intersect_questions(obs, target):
        # Count the entries of obs (with repeats) that also occur in target.
        # The set is built once so each membership test is O(1).
        target_set = set(target)
        return len([w for w in obs if w in target_set])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df[count_col] = list(
                    map(word_count_intersect_questions,
                        df[obs_name + "_" + gram],
                        df[target_name + "_" + gram]))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(
                    map(try_divide, df[count_col],
                        df["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat
        q2_in_q1 = df["count_of_question2_%s_in_question1" % gram]
        df["question2_%s_in_question1_div_question1_%s" % (gram, gram)] = list(
            map(try_divide, q2_in_q1, df["count_of_question1_%s" % gram]))
        df["question2_%s_in_question1_div_question1_%s_in_question2" % (gram, gram)] = list(
            map(try_divide, q2_in_q1,
                df["count_of_question1_%s_in_question2" % gram]))

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
             ("max", np.max), ("std", np.std)]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = list(
                    map(get_position_list, df[obs_name + "_" + gram],
                        df[target_name + "_" + gram]))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat_name, stat_func in stats:
                    df["%s_%s" % (prefix, stat_name)] = list(map(stat_func, pos))
                ## stats feat on normalized_pos
                obs_count = df["count_of_%s_%s" % (obs_name, gram)]
                for stat_name, _ in stats:
                    df["normalized_%s_%s" % (prefix, stat_name)] = list(
                        map(try_divide, df["%s_%s" % (prefix, stat_name)], obs_count))
コード例 #14
0
def extract_feat(df):
    """Add n-gram counting, intersection and position features for query/title/description.

    Reads the "query", "product_title" and "product_description" columns and
    writes every new column into ``df`` in place; nothing is returned.

    Runs on both Python 2 and Python 3: ``print`` is called with a single
    argument and every ``map`` result is materialized with ``list`` before
    it is stored in a column (on Python 3 ``map`` returns a lazy iterator).
    """
    join_str = "_"
    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    ## unigram
    print("generate unigram")
    df["query_unigram"] = list(df.apply(lambda x: preprocess_data(x["query"]), axis=1))
    df["title_unigram"] = list(df.apply(lambda x: preprocess_data(x["product_title"]), axis=1))
    df["description_unigram"] = list(df.apply(lambda x: preprocess_data(x["product_description"]), axis=1))
    ## bigram
    print("generate bigram")
    df["query_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["query_unigram"], join_str), axis=1))
    df["title_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["title_unigram"], join_str), axis=1))
    df["description_bigram"] = list(df.apply(lambda x: ngram.getBigram(x["description_unigram"], join_str), axis=1))
    ## trigram
    print("generate trigram")
    df["query_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["query_unigram"], join_str), axis=1))
    df["title_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["title_unigram"], join_str), axis=1))
    df["description_trigram"] = list(df.apply(lambda x: ngram.getTrigram(x["description_unigram"], join_str), axis=1))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")

    def count_digit(tokens):
        # Number of tokens that consist only of digits.
        return sum([1. for w in tokens if w.isdigit()])

    for feat_name in feat_names:
        for gram in grams:
            gram_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[count_col] = list(df.apply(lambda x: len(x[gram_col]), axis=1))
            df[unique_col] = list(df.apply(lambda x: len(set(x[gram_col])), axis=1))
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, df[unique_col], df[count_col]))

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        unigram_col = feat_name + "_unigram"
        df[digit_col] = list(df.apply(lambda x: count_digit(x[unigram_col]), axis=1))
        df["ratio_of_digit_in_%s" % feat_name] = list(
            map(try_divide, df[digit_col], df["count_of_%s_unigram" % feat_name]))

    ## description missing indicator
    # NOTE(review): this compares the preprocess_data output with "" -- if
    # that output is a token list the indicator is always 0; confirm.
    df["description_missing"] = list(df.apply(lambda x: int(x["description_unigram"] == ""), axis=1))

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")

    def count_in_target(obs_tokens, target_tokens):
        # Entries of obs_tokens (with repeats) that also occur in
        # target_tokens; the set is built once per row, not once per token.
        target_set = set(target_tokens)
        return sum([1. for w in obs_tokens if w in target_set])

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df[count_col] = list(
                    df.apply(lambda x: count_in_target(x[obs_col], x[target_col]), axis=1))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(
                    map(try_divide, df[count_col], df["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat
        query_count = df["count_of_query_%s" % gram]
        for other in ("title", "description"):
            other_in_query = df["count_of_%s_%s_in_query" % (other, gram)]
            df["%s_%s_in_query_div_query_%s" % (other, gram, gram)] = list(
                map(try_divide, other_in_query, query_count))
            df["%s_%s_in_query_div_query_%s_in_%s" % (other, gram, gram, other)] = list(
                map(try_divide, other_in_query,
                    df["count_of_query_%s_in_%s" % (gram, other)]))

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
             ("max", np.max), ("std", np.std)]
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                pos = list(df.apply(lambda x: get_position_list(x[target_col], obs=x[obs_col]), axis=1))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat_name, stat_func in stats:
                    df["%s_%s" % (prefix, stat_name)] = list(map(stat_func, pos))
                ## stats feat on normalized_pos
                obs_count = df["count_of_%s_%s" % (obs_name, gram)]
                for stat_name, _ in stats:
                    df["normalized_%s_%s" % (prefix, stat_name)] = list(
                        map(try_divide, df["%s_%s" % (prefix, stat_name)], obs_count))
コード例 #15
0
def process():
    """Build the combined train/test frame with n-gram columns and run the feature generators.

    With ``read`` False (the hard-coded setting) the processed CSV files are
    loaded, stances are mapped to integer targets, the test set is appended,
    unigram/bigram/trigram columns are added for "Headline" and
    "articleBody", and the frame is pickled to data.pkl. Otherwise the frame
    is loaded back from data.pkl. Every feature generator then processes the
    frame and reads back its 'train' features.

    ``print`` is called with a single argument and the target column is built
    with a list comprehension, so these statements behave the same on
    Python 2 and Python 3.
    """
    read = False
    if not read:

        body_train = pd.read_csv("train_bodies_processed.csv",
                                 encoding='utf-8')
        stances_train = pd.read_csv("train_stances_processed.csv",
                                    encoding='utf-8')
        # training set
        train = pd.merge(stances_train, body_train, how='left', on='Body ID')
        targets = ['agree', 'disagree', 'discuss', 'unrelated']
        targets_dict = dict(zip(targets, range(len(targets))))
        # A list (not a lazy map object) so the column holds real values.
        train['target'] = [targets_dict[x] for x in train['Stance']]
        print('train.shape:')
        print(train.shape)

        data = train
        # read test set, no 'Stance' column in test set -> target = NULL
        # concatenate training and test set
        test_flag = True
        if test_flag:
            body_test = pd.read_csv("test_bodies_processed.csv",
                                    encoding='utf-8')
            headline_test = pd.read_csv("test_stances_unlabeled.csv",
                                        encoding='utf-8')
            test = pd.merge(headline_test, body_test, how="left", on="Body ID")

            data = pd.concat((train, test))  # target = NaN for test set
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(
            lambda x: preprocess_data(x))
        data["articleBody_unigram"] = data["articleBody"].map(
            lambda x: preprocess_data(x))

        print("generate bigram")
        join_str = "_"
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        # NOTE(review): cPickle is the Python 2 module name; a Python 3 port
        # of this file has to bind it, e.g. `import pickle as cPickle`.
        with open('data.pkl', 'wb') as outfile:
            cPickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        # NOTE: unpickling can execute arbitrary code; only load a data.pkl
        # that was produced locally by this function.
        with open('data.pkl', 'rb') as infile:
            data = cPickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]

    # All generators process the data before any of them reads features back.
    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    print('done')
コード例 #16
0
def process():
    """Build the n-gram frame for the Fake/Real data set and run the feature generators.

    With ``read`` left at ``False`` the function:

    1. loads ``data.csv``, drops rows whose ``Body`` is null and renames the
       columns to ``URLs``, ``Headline``, ``articleBody`` and ``Stance``;
    2. makes a 75/25 train/test split (``random_state=0``) and writes the
       parts to ``gdbt_training_input.csv``, ``gdbt_testing_input.csv`` and
       ``gdbt_testing_ouput.csv``;
    3. encodes ``Stance`` as an integer ``target`` (``Fake`` -> 0,
       ``Real`` -> 1) on the training rows and appends the unlabeled test
       rows, whose ``target`` is therefore NaN;
    4. adds unigram, bigram and trigram columns for ``Headline`` and
       ``articleBody`` and pickles the frame to ``data.pkl``.

    With ``read`` set to ``True`` the frame is loaded from ``data.pkl``
    instead.  In both cases every feature generator then processes the frame
    and reads back its 'train' and 'test' output.

    Raises:
        KeyError: if ``Stance`` holds a label other than 'Fake' or 'Real'.
    """
    read = False
    if not read:
        dataset = pd.read_csv('data.csv')
        dataset = dataset[pd.notnull(dataset['Body'])]
        dataset.columns = ['URLs', 'Headline', 'articleBody', 'Stance']

        X_data = dataset.iloc[:, 1:3]
        Y_data = dataset.iloc[:, 3]

        # sklearn.cross_validation no longer exists in current scikit-learn;
        # prefer its replacement and fall back for old installations.
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split

        X_train, X_test, Y_train, Y_test = train_test_split(X_data,
                                                            Y_data,
                                                            test_size=0.25,
                                                            random_state=0)

        train = pd.concat([X_train, Y_train], axis=1)

        train.to_csv('gdbt_training_input.csv', index=False)
        X_test.to_csv('gdbt_testing_input.csv', index=False)
        pd.DataFrame(Y_test).to_csv('gdbt_testing_ouput.csv', index=False)

        targets = ['Fake', 'Real']
        targets_dict = dict(zip(targets, range(len(targets))))
        # A list (not a lazy map object) so the assignment also works on
        # Python 3; unknown labels still raise KeyError.
        train['target'] = [targets_dict[x] for x in train['Stance']]

        data = train

        # The test rows carry no 'Stance'/'target', so after concatenation
        # a NaN target marks a test row.
        test_flag = True
        if test_flag:
            data = pd.concat((train, X_test))
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        print("generate unigram")
        data["Headline_unigram"] = data["Headline"].map(
            lambda x: preprocess_data(x))
        print(data.head())
        data["articleBody_unigram"] = data["articleBody"].map(
            lambda x: preprocess_data(x))

        join_str = "_"
        print("generate bigram")
        data["Headline_bigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleBody_bigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["Headline_trigram"] = data["Headline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        data["articleBody_trigram"] = data["articleBody_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            pickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        with open('data.pkl', 'rb') as infile:
            data = pickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]

    # Every generator processes the frame before any feature is read back.
    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    for g in generators:
        g.read('test')

    print('done')
コード例 #17
0
def process():
    """Build n-gram columns for claim/article headlines and run the feature generators.

    Loads ``./data/merged_data_tain.csv``, keeps the four columns in
    ``used_column``, drops rows with missing values and makes a 67/33
    train/test split (``random_state=1234``).  Only the training part is
    processed further; the held-out part of the split is not used here.

    With ``read`` left at ``False`` the training rows get an integer
    ``target`` derived from ``articleStance`` ('observing' -> 0, 'for' -> 1,
    'against' -> 2, 'ignoring' -> 3) plus unigram, bigram and trigram columns
    for ``claimHeadline`` and ``articleHeadline``; the frame is then pickled
    to ``data.pkl``.  With ``read`` set to ``True`` the frame is loaded from
    ``data.pkl`` instead.  Finally every feature generator processes the
    frame and reads back its 'train' output.

    Raises:
        KeyError: if ``articleStance`` holds a label outside ``targets``.
    """
    full_data = pd.read_csv('./data/merged_data_tain.csv', encoding='utf-8')
    used_column = [
        'claimHeadline', 'articleHeadline', 'claimTruthiness', 'articleStance'
    ]
    full_data = full_data[used_column]
    full_data = full_data.dropna()
    train, test = train_test_split(full_data,
                                   test_size=0.33,
                                   random_state=1234)

    read = False
    if not read:

        targets = ['observing', 'for', 'against', 'ignoring']
        targets_dict = dict(zip(targets, range(len(targets))))
        # A list so the column assignment does not depend on map() returning
        # a list (it only does on Python 2).
        train['target'] = [targets_dict[x] for x in train['articleStance']]
        print('train.shape:')
        print(train.shape)

        data = train

        test_flag = True
        if test_flag:
            print(data)
            print('data.shape:')
            print(data.shape)

            train = data[~data['target'].isnull()]
            print(train)
            print('train.shape:')
            print(train.shape)

            # No unlabeled rows were appended to data, so this selects the
            # rows (if any) whose target is NaN.
            test = data[data['target'].isnull()]
            print(test)
            print('test.shape:')
            print(test.shape)

        print("generate unigram")
        data["claimHeadline_unigram"] = data["claimHeadline"].map(
            lambda x: preprocess_data(x))
        data["articleHeadline_unigram"] = data["articleHeadline"].map(
            lambda x: preprocess_data(x))

        join_str = "_"
        print("generate bigram")
        data["claimHeadline_bigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))
        data["articleHeadline_bigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getBigram(x, join_str))

        print("generate trigram")
        data["claimHeadline_trigram"] = data["claimHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))
        # Trigrams are built from the unigram tokens, like every other
        # n-gram column (the bigram column was used here by mistake).
        data["articleHeadline_trigram"] = data["articleHeadline_unigram"].map(
            lambda x: ngram.getTrigram(x, join_str))

        with open('data.pkl', 'wb') as outfile:
            cPickle.dump(data, outfile, -1)
            print('dataframe saved in data.pkl')

    else:
        with open('data.pkl', 'rb') as infile:
            data = cPickle.load(infile)
            print('data loaded')
            print('data.shape:')
            print(data.shape)

    # define feature generators
    countFG = CountFeatureGenerator()
    tfidfFG = TfidfFeatureGenerator()
    svdFG = SvdFeatureGenerator()
    word2vecFG = Word2VecFeatureGenerator()
    sentiFG = SentimentFeatureGenerator()
    generators = [countFG, tfidfFG, svdFG, word2vecFG, sentiFG]

    # Every generator processes the frame before any feature is read back.
    for g in generators:
        g.process(data)

    for g in generators:
        g.read('train')

    print('done')
コード例 #18
0
def extract_feat(df):
    """Add n-gram, counting and overlap features for the two questions to ``df``.

    ``df`` must provide ``question1`` and ``question2`` columns.  It is
    modified in place and nothing is returned.  Columns are added in this
    order (``<q>``, ``<a>``, ``<b>`` are question names, ``<gram>`` is
    unigram/bigram/trigram):

    * ``<q>_unigram``, ``<q>_bigram``, ``<q>_trigram``;
    * ``count_of_<q>_<gram>``, ``count_of_unique_<q>_<gram>``,
      ``ratio_of_unique_<q>_<gram>``;
    * ``count_of_digit_in_<q>``, ``ratio_of_digit_in_<q>``;
    * ``count_of_<a>_unigram_subtract_<b>`` and
      ``count_of_<a>_digit_subtract_<b>``: ``|a - b| / (a + b)`` of the two
      counts, or 1 when both counts are zero;
    * ``count_of_<a>_<gram>_in_<b>`` and ``ratio_of_<a>_<gram>_in_<b>``:
      number (and share) of n-grams of ``a`` that also occur in ``b``.
    """
    feat_names = ["question1", "question2"]
    grams = ["unigram", "bigram", "trigram"]
    join_str = "_"

    def per_row(func):
        # Evaluate func on every row and return the results as a plain list.
        return list(df.apply(func, axis=1))

    def ratio(numerators, denominators):
        # Element-wise try_divide of two equally long columns.
        return list(map(try_divide, numerators, denominators))

    def count_digit(tokens):
        # Number of tokens that consist of digits only.
        return sum([1. for w in tokens if w.isdigit()])

    def relative_gap(a, b):
        # |a - b| / (a + b); defined as 1 when both counts are zero.
        total = a + b
        return 1 if total == 0 else 1.0 * abs(a - b) / total

    def count_shared(obs_tokens, target_tokens):
        # Build the lookup set once per row instead of once per token.
        target_set = set(target_tokens)
        return sum([1. for w in obs_tokens if w in target_set])

    ## unigram
    print("generate unigram")
    for feat_name in feat_names:
        df[feat_name + "_unigram"] = per_row(
            lambda x: preprocess_data(x[feat_name]))
    ## bigram
    print("generate bigram")
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = per_row(
            lambda x: ngram.getBigram(x[feat_name + "_unigram"], join_str))
    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = per_row(
            lambda x: ngram.getTrigram(x[feat_name + "_unigram"], join_str))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            tokens_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## number of n-grams
            df[count_col] = per_row(lambda x: len(x[tokens_col]))
            ## number of distinct n-grams
            df[unique_col] = per_row(lambda x: len(set(x[tokens_col])))
            ## share of distinct n-grams
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = ratio(
                df[unique_col], df[count_col])

        ## number and share of digit tokens
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = per_row(
            lambda x: count_digit(x[feat_name + "_unigram"]))
        df["ratio_of_digit_in_%s" % feat_name] = ratio(
            df[digit_col], df["count_of_%s_unigram" % (feat_name)])

    ####################################
    ## subtract word and letter count ##
    ####################################
    print("generate subtract word counting features")
    for obs_name in feat_names:
        for target_name in feat_names:
            if target_name == obs_name:
                continue
            obs_col = obs_name + "_unigram"
            target_col = target_name + "_unigram"
            ## relative difference of the unigram counts
            df["count_of_%s_%s_subtract_%s" % (obs_name, "unigram", target_name)] = per_row(
                lambda x: relative_gap(len(x[obs_col]), len(x[target_col])))
            ## relative difference of the digit counts
            df["count_of_%s_%s_subtract_%s" % (obs_name, "digit", target_name)] = per_row(
                lambda x: relative_gap(count_digit(x[obs_col]),
                                       count_digit(x[target_col])))

    ##############################
    ## intersect word count ######
    ##############################
    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                shared_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## n-grams of obs that also occur in target
                df[shared_col] = per_row(
                    lambda x: count_shared(x[obs_name + "_" + gram],
                                           x[target_name + "_" + gram]))
                ## the same count relative to the number of obs n-grams
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = ratio(
                    df[shared_col], df["count_of_%s_%s" % (obs_name, gram)])
コード例 #19
0
def extract_feat(df):
    """Add n-gram, counting, overlap and position features to ``df`` in place.

    ``df`` must provide ``query``, ``product_title`` and
    ``product_description`` columns; nothing is returned.  The features use
    the short names ``query``, ``title`` and ``description`` and are added
    in this order (``<gram>`` is unigram/bigram/trigram):

    * ``<f>_unigram``, ``<f>_bigram``, ``<f>_trigram``;
    * ``count_of_<f>_<gram>``, ``count_of_unique_<f>_<gram>``,
      ``ratio_of_unique_<f>_<gram>``, ``count_of_digit_in_<f>``,
      ``ratio_of_digit_in_<f>``;
    * ``description_missing``;
    * ``count_of_<a>_<gram>_in_<b>`` / ``ratio_of_<a>_<gram>_in_<b>`` and
      the ``..._in_query_div_query_...`` ratios for title and description;
    * ``pos_of_<a>_<gram>_in_<b>_<stat>`` and the ``normalized_`` variants
      for the stats min, mean, median, max and std.
    """
    feat_names = ["query", "title", "description"]
    raw_columns = [("query", "query"),
                   ("title", "product_title"),
                   ("description", "product_description")]
    grams = ["unigram", "bigram", "trigram"]
    join_str = "_"
    pos_stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
                 ("max", np.max), ("std", np.std)]

    def per_row(func):
        # Evaluate func on every row and return the results as a plain list.
        return list(df.apply(func, axis=1))

    def ratio(numerators, denominators):
        # Element-wise try_divide; list() keeps this a list on Python 3 too.
        return list(map(try_divide, numerators, denominators))

    def count_digit(tokens):
        # Number of tokens that consist of digits only.
        return sum([1. for w in tokens if w.isdigit()])

    def count_shared(obs_tokens, target_tokens):
        # Build the lookup set once per row instead of once per token.
        target_set = set(target_tokens)
        return sum([1. for w in obs_tokens if w in target_set])

    ## unigram
    print("generate unigram")
    for feat_name, raw_name in raw_columns:
        df[feat_name + "_unigram"] = per_row(
            lambda x: preprocess_data(x[raw_name]))
    ## bigram
    print("generate bigram")
    for feat_name in feat_names:
        df[feat_name + "_bigram"] = per_row(
            lambda x: ngram.getBigram(x[feat_name + "_unigram"], join_str))
    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df[feat_name + "_trigram"] = per_row(
            lambda x: ngram.getTrigram(x[feat_name + "_unigram"], join_str))

    ################################
    ## word count and digit count ##
    ################################
    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            tokens_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df[count_col] = per_row(lambda x: len(x[tokens_col]))
            df[unique_col] = per_row(lambda x: len(set(x[tokens_col])))
            df["ratio_of_unique_%s_%s" % (feat_name, gram)] = ratio(
                df[unique_col], df[count_col])

        ## digit count
        digit_col = "count_of_digit_in_%s" % feat_name
        df[digit_col] = per_row(
            lambda x: count_digit(x[feat_name + "_unigram"]))
        df["ratio_of_digit_in_%s" % feat_name] = ratio(
            df[digit_col], df["count_of_%s_unigram" % (feat_name)])

    ## description missing indicator
    # NOTE(review): this compares the token column with "". If
    # preprocess_data returns a token list the indicator is always 0 --
    # confirm the intended check before relying on this feature.
    df["description_missing"] = per_row(
        lambda x: int(x["description_unigram"] == ""))

    ##############################
    ## intersect word count ##
    ##############################
    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                shared_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df[shared_col] = per_row(
                    lambda x: count_shared(x[obs_name + "_" + gram],
                                           x[target_name + "_" + gram]))
                df["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = ratio(
                    df[shared_col], df["count_of_%s_%s" % (obs_name, gram)])

        ## some other feat: overlap with the query relative to the query
        ## size and to the reverse overlap
        for field in ["title", "description"]:
            in_query = df["count_of_%s_%s_in_query" % (field, gram)]
            df["%s_%s_in_query_div_query_%s" % (field, gram, gram)] = ratio(
                in_query, df["count_of_query_%s" % gram])
            df["%s_%s_in_query_div_query_%s_in_%s" % (field, gram, gram, field)] = ratio(
                in_query, df["count_of_query_%s_in_%s" % (gram, field)])

    ######################################
    ## intersect word position feat ##
    ######################################
    print("generate intersect word position features")
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = per_row(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat_name, stat_func in pos_stats:
                    df["%s_%s" % (prefix, stat_name)] = list(map(stat_func, pos))
                ## stats feat on normalized_pos
                obs_count = df["count_of_%s_%s" % (obs_name, gram)]
                for stat_name, _ in pos_stats:
                    df["normalized_%s_%s" % (prefix, stat_name)] = ratio(
                        df["%s_%s" % (prefix, stat_name)], obs_count)
コード例 #20
0
ファイル: model.py プロジェクト: anirag/Kaggle
def extract_feat(df_all):
    """Add length, overlap, n-gram, position and distance features to ``df_all``.

    ``df_all`` must provide the text columns ``search_term``,
    ``product_title``, ``product_description``, ``product_attributes``,
    ``brand``, ``color`` and ``appl``.  The frame is modified in place and
    also returned.  Feature groups, in the order they are added:

    * ``len_of_*`` word counts and the tab-joined ``product_info`` column;
    * ``word_in_*`` (``str_common_word`` of the search term against each
      other field), ``ratio_*`` (that value divided by ``len_of_query``) and
      ``cs_1``..``cs_3`` (``cs`` of query/title, query/description and
      title/description);
    * ``<f>_unigram`` / ``_bigram`` / ``_trigram`` for query, title and
      description, then the ``query_<gram>_<field>_<gram>`` co-occurrence
      columns;
    * counting, intersect and position features per n-gram size;
    * ``jaccard_coef`` / ``dice_dist`` between each pair of fields for
      bigrams and trigrams.

    Returns:
        The same ``df_all`` object with the feature columns added.
    """
    feat_names = ["query", "title", "description"]
    raw_columns = [("query", "search_term"),
                   ("title", "product_title"),
                   ("description", "product_description")]
    grams = ["unigram", "bigram", "trigram"]
    pos_stats = [("min", np.min), ("mean", np.mean), ("median", np.median),
                 ("max", np.max), ("std", np.std)]
    # Position of each field inside the tab-separated product_info string;
    # index 0 is the search term itself.
    info_fields = [("title", 1), ("description", 2), ("attributes", 3),
                   ("brand", 4), ("color", 5), ("appl", 6)]

    def per_row(func):
        # Evaluate func on every row and return the results as a plain list.
        return list(df_all.apply(func, axis=1))

    def ratio(numerators, denominators):
        # Element-wise try_divide; list() keeps this a list on Python 3 too.
        return list(map(try_divide, numerators, denominators))

    def info_pair(func, left, right):
        # Apply func to two fields of product_info, splitting each row once.
        def on_info(info):
            fields = info.split('\t')
            return func(fields[left], fields[right])
        return df_all['product_info'].map(on_info)

    def count_shared(obs_tokens, target_tokens):
        # Build the lookup set once per row instead of once per token.
        target_set = set(target_tokens)
        return sum([1. for w in obs_tokens if w in target_set])

    for short_name, column in [("query", "search_term"),
                               ("title", "product_title"),
                               ("description", "product_description"),
                               ("brand", "brand")]:
        df_all['len_of_' + short_name] = df_all[column].map(
            lambda x: len(x.split())).astype(np.int64)

    df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title']+"\t"+df_all['product_description']+"\t"+df_all['product_attributes']+"\t"+df_all['brand']+"\t"+df_all['color']+"\t"+df_all['appl']

    for field, position in info_fields:
        df_all['word_in_' + field] = info_pair(str_common_word, 0, position)

    for field, _ in info_fields:
        df_all['ratio_' + field] = df_all['word_in_' + field]/df_all['len_of_query']

    df_all['cs_1'] = info_pair(cs, 0, 1)
    df_all['cs_2'] = info_pair(cs, 0, 2)
    df_all['cs_3'] = info_pair(cs, 1, 2)

    print("generate unigram")
    for feat_name, raw_name in raw_columns:
        df_all[feat_name + "_unigram"] = per_row(
            lambda x: x[raw_name].lower().split())

    # Bigrams and trigrams are derived from the lowercased unigram tokens so
    # that all n-gram sizes are compared on the same (case-folded) text.
    print("generate bigram")
    join_str = "_"
    for feat_name in feat_names:
        df_all[feat_name + "_bigram"] = per_row(
            lambda x: ngram.getBigram(x[feat_name + "_unigram"], join_str))
    ## trigram
    print("generate trigram")
    for feat_name in feat_names:
        df_all[feat_name + "_trigram"] = per_row(
            lambda x: ngram.getTrigram(x[feat_name + "_unigram"], join_str))

    # Co-occurrence terms of query uni-/bigrams with title and description
    # uni-/bigrams, joined with "X".
    join_str = "X"
    for query_gram in ["unigram", "bigram"]:
        for field in ["title", "description"]:
            for field_gram in ["unigram", "bigram"]:
                df_all["query_%s_%s_%s" % (query_gram, field, field_gram)] = per_row(
                    lambda x: cooccurrence_terms(x["query_" + query_gram],
                                                 x[field + "_" + field_gram],
                                                 join_str))

    print("generate word counting features")
    for feat_name in feat_names:
        for gram in grams:
            tokens_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            df_all[count_col] = per_row(lambda x: len(x[tokens_col]))
            df_all[unique_col] = per_row(lambda x: len(set(x[tokens_col])))
            df_all["ratio_of_unique_%s_%s" % (feat_name, gram)] = ratio(
                df_all[unique_col], df_all[count_col])

    print("generate intersect word counting features")
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue
                shared_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                df_all[shared_col] = per_row(
                    lambda x: count_shared(x[obs_name + "_" + gram],
                                           x[target_name + "_" + gram]))
                df_all["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = ratio(
                    df_all[shared_col], df_all["count_of_%s_%s" % (obs_name, gram)])

        ## some other feat: title/query overlap relative to the query size
        ## and to the reverse overlap
        title_in_query = df_all["count_of_title_%s_in_query" % gram]
        df_all["title_%s_in_query_div_query_%s" % (gram, gram)] = ratio(
            title_in_query, df_all["count_of_query_%s" % gram])
        df_all["title_%s_in_query_div_query_%s_in_title" % (gram, gram)] = ratio(
            title_in_query, df_all["count_of_query_%s_in_title" % gram])

    print("generate intersect word position features")
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name == obs_name:
                    continue
                pos = per_row(
                    lambda x: get_position_list(x[target_name + "_" + gram],
                                                obs=x[obs_name + "_" + gram]))
                prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ## stats feat on pos
                for stat_name, stat_func in pos_stats:
                    df_all["%s_%s" % (prefix, stat_name)] = list(map(stat_func, pos))
                ## stats feat on normalized_pos
                obs_count = df_all["count_of_%s_%s" % (obs_name, gram)]
                for stat_name, _ in pos_stats:
                    df_all["normalized_%s_%s" % (prefix, stat_name)] = ratio(
                        df_all["%s_%s" % (prefix, stat_name)], obs_count)

    print("generate jaccard coef and dice dist for n-gram")
    dists = ["jaccard_coef", "dice_dist"]
    for dist in dists:
        for gram in ["bigram", "trigram"]:
            # Every unordered pair of fields exactly once.
            for i, target_name in enumerate(feat_names[:-1]):
                for obs_name in feat_names[i + 1:]:
                    df_all["%s_of_%s_between_%s_%s" % (dist, gram, target_name, obs_name)] = per_row(
                        lambda x: compute_dist(x[target_name + "_" + gram],
                                               x[obs_name + "_" + gram], dist))

    return df_all
def main():
    """Generate unigram, bigram and trigram pickles for every column and split.

    For each split in ``catagories`` and each column in ``columnNames`` the
    pickled records are loaded from
    ``./ModelSystem/ProcessedData/<col>_<split>.pickle``.  Each record is
    converted with ``str`` and split on whitespace; bigrams and trigrams are
    built from those words with ``ngram`` using "_" as the join string.  Any
    resulting token equal to "nan" is replaced by the empty string.  The
    three token-list collections are written to
    ``./ModelSystem/Features/ngram/<col>_<gram>_<split>.pickle``.
    """

    ###############
    ## Load Data ##
    ###############
    dataPath = "./ModelSystem/ProcessedData"
    columnNames = ["query", "title", "description"]
    catagories = ["train", "test"]

    def blank_nan(tokens):
        # Replace every token that is exactly "nan" with an empty string.
        return ["" if token == "nan" else token for token in tokens]

    for cata in catagories:
        for col in columnNames:
            path = "%s/%s_%s.pickle" % (dataPath, col, cata)
            with open(path, "rb") as f:
                records = pickle.load(f)

            # build the 1-, 2- and 3-gram token lists for every record
            output_unigram = []
            output_bigram = []
            output_trigram = []
            # Index-based access is kept on purpose: the container type of
            # the pickled records is not visible from here.
            for idx in range(len(records)):
                # tokenize on whitespace
                wordList = str(records[idx]).split()

                # The n-grams are built from the words before any "nan"
                # token is blanked.
                bigram = ngram.getBigram(wordList, "_")
                trigram = ngram.getTrigram(wordList, "_")

                output_unigram.append(blank_nan(wordList))
                output_bigram.append(blank_nan(bigram))
                output_trigram.append(blank_nan(trigram))

            outputs = [("unigram", output_unigram),
                       ("bigram", output_bigram),
                       ("trigram", output_trigram)]
            for gram_name, rows in outputs:
                path = "./ModelSystem/Features/ngram/%s_%s_%s.pickle" % (
                    col, gram_name, cata)
                with open(path, "wb") as f:
                    pickle.dump(rows, f)

            print("%s_ngram_%s Completed" % (col, cata))

    print("ngram All Done.")
コード例 #22
0
def extract_feat(df):
	"""Generate counting and word-position features for ``df`` and write them to CSV.

	Steps, in order:
	  1. Build unigram / bigram / trigram columns for the query, title,
	     description and attribute values.
	  2. Add word-count, unique-word-count and digit-count features (plus
	     their ratios) for every field and n-gram size.
	  3. Add a "description_missing" indicator.
	  4. Drop the raw text columns, then add min/mean/median/max/std statistics
	     (raw and normalized) of the positions returned by ``get_position_list``
	     for every ordered pair of distinct fields.
	  5. Drop the intermediate n-gram columns and write the result to
	     "../../data/feat/test_countingfeat_part3.csv".

	Args:
		df: pandas DataFrame that has the columns "query", "title",
			"description" and "values". Feature columns are added to this
			object in place; the column drops are applied to a new DataFrame
			bound to the local name only.

	Returns:
		None. The output is the CSV file written at the end.
	"""
	join_str = "_"
	feat_names = ["query", "title", "description", "attribute_values"]
	grams = ["unigram", "bigram", "trigram"]

	## unigrams: only the query is preprocessed with stem=True;
	## the attribute values are read from the "values" column
	df["query_unigram"] = list(df.apply(lambda x: preprocess_data(x["query"], stem=True), axis=1))
	df["title_unigram"] = list(df.apply(lambda x: preprocess_data(x["title"]), axis=1))
	df["description_unigram"] = list(df.apply(lambda x: preprocess_data(x["description"]), axis=1))
	df["attribute_values_unigram"] = list(df.apply(lambda x: preprocess_data(x["values"]), axis=1))

	## bigrams, then trigrams, are derived from the unigram columns
	for gram, build_ngram in (("bigram", ngram.getBigram), ("trigram", ngram.getTrigram)):
		for feat_name in feat_names:
			df["%s_%s" % (feat_name, gram)] = list(df.apply(lambda x: build_ngram(x[feat_name + "_unigram"], join_str), axis=1))

	################################
	## word count and digit count ##
	################################
	# print() with a single argument behaves the same on Python 2 and Python 3
	print("generate word counting features")

	def count_digit(words):
		# number of items for which str.isdigit() is true, accumulated as floats
		return sum([1. for w in words if w.isdigit()])

	for feat_name in feat_names:
		for gram in grams:
			gram_col = feat_name + "_" + gram
			count_col = "count_of_%s_%s" % (feat_name, gram)
			unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
			## word count
			df[count_col] = list(df.apply(lambda x: len(x[gram_col]), axis=1))
			df[unique_col] = list(df.apply(lambda x: len(set(x[gram_col])), axis=1))
			# list(map(...)) keeps the Python 2 result and also works on Python 3,
			# where map() returns an iterator
			df["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(map(try_divide, df[unique_col], df[count_col]))

		## digit count
		digit_col = "count_of_digit_in_%s" % feat_name
		df[digit_col] = list(df.apply(lambda x: count_digit(x[feat_name + "_unigram"]), axis=1))
		df["ratio_of_digit_in_%s" % feat_name] = list(map(try_divide, df[digit_col], df["count_of_%s_unigram" % feat_name]))

	## description missing indicator
	# The original compared the unigram value to "" which can never be true if
	# preprocess_data returns a token list (presumably it does, since the value
	# is fed to ngram.getBigram -- TODO confirm). A length check flags both an
	# empty list and an empty string.
	df["description_missing"] = list(df.apply(lambda x: int(len(x["description_unigram"]) == 0), axis=1))

	##############################
	## intersect word count     ##
	##############################
	# Disabled in the original code; kept here for reference.
	# print "generate intersect word counting features"
	# for gram in grams:
	# 	for obs_name in feat_names:
	# 		for target_name in feat_names:
	# 			if target_name != obs_name:
	# 				df["count_of_%s_%s_in_%s"%(obs_name,gram,target_name)] = list(df.apply(lambda x: sum([1. for w in x[obs_name+"_"+gram] if w in set(x[target_name+"_"+gram])]), axis=1))
	# 				df["ratio_of_%s_%s_in_%s"%(obs_name,gram,target_name)] = map(try_divide, df["count_of_%s_%s_in_%s"%(obs_name,gram,target_name)], df["count_of_%s_%s"%(obs_name,gram)])
	# 	## some other feat
	# 	df["title_%s_in_query_div_query_%s"%(gram,gram)] = map(try_divide, df["count_of_title_%s_in_query"%gram], df["count_of_query_%s"%gram])
	# 	df["title_%s_in_query_div_query_%s_in_title"%(gram,gram)] = map(try_divide, df["count_of_title_%s_in_query"%gram], df["count_of_query_%s_in_title"%gram])
	# 	df["description_%s_in_query_div_query_%s"%(gram,gram)] = map(try_divide, df["count_of_description_%s_in_query"%gram], df["count_of_query_%s"%gram])
	# 	df["description_%s_in_query_div_query_%s_in_description"%(gram,gram)] = map(try_divide, df["count_of_description_%s_in_query"%gram], df["count_of_query_%s_in_description"%gram])

	######################################
	## intersect word position feat     ##
	######################################
	# The raw text columns are no longer needed once the n-gram columns exist.
	# (The log message text is kept unchanged from the original.)
	print("dropping unigrams bigrams trigrams")
	df = df.drop(['query', 'description', 'title', 'values'], axis=1)

	print("generate intersect word position features")
	# (suffix, aggregator) pairs, in the order the columns must be created
	pos_stats = [("min", np.min), ("mean", np.mean), ("median", np.median), ("max", np.max), ("std", np.std)]
	for gram in grams:
		for target_name in feat_names:
			for obs_name in feat_names:
				if target_name == obs_name:
					continue
				pos = list(df.apply(lambda x: get_position_list(x[target_name + "_" + gram], obs=x[obs_name + "_" + gram]), axis=1))
				prefix = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)
				## stats feat on pos
				for suffix, aggregate in pos_stats:
					df["%s_%s" % (prefix, suffix)] = list(map(aggregate, pos))
				## stats feat on normalized_pos (divided by the obs n-gram count)
				obs_count = df["count_of_%s_%s" % (obs_name, gram)]
				for suffix, _ in pos_stats:
					df["normalized_%s_%s" % (prefix, suffix)] = list(map(try_divide, df["%s_%s" % (prefix, suffix)], obs_count))

	## drop the intermediate n-gram columns before writing the features out
	ngram_cols = ["%s_%s" % (feat_name, gram) for gram in grams for feat_name in feat_names]
	df = df.drop(ngram_cols, axis=1)
	print("creating csv")
	df.to_csv("../../data/feat/test_countingfeat_part3.csv", header=True, index=False)