Code example #1
def retrieveCatLabels(Y_out, training_data_file_name):
    # Assumed module-level context from the surrounding project: the `explore`
    # helper module, getQueryData(), a stemmer instance `ps`, and the global
    # postings_sorted_by_freq_file_name.
    training_data_file_name_prefix = training_data_file_name.split("-")[0]
    unrolled_data_file_name = training_data_file_name_prefix + explore.unrolled_file_name_suffix
    unrolled_data = explore.read_unrolled_data(unrolled_data_filename=unrolled_data_file_name)
    postings_sorted_by_freq_file = open(postings_sorted_by_freq_file_name)
    query_data = getQueryData(unrolled_data=unrolled_data, Y_out=Y_out, use_stemming=True)
    for index in range(len(query_data)):
        query_terms = query_data[index].query_terms
        postings_set = set()
        # Rewind the postings file for each query; otherwise it is exhausted
        # after the first query and every later query sees no postings at all.
        postings_sorted_by_freq_file.seek(0)
        print "processing term ", index
        iteration = 0
        while True:
            print "iteration : ", iteration
            iteration += 1
            # Read the postings file in ~100 MB batches of lines so the whole
            # index never has to fit in memory at once.
            lines = postings_sorted_by_freq_file.readlines(100000000)
            if not lines:
                break
            print "initiating temp_dict"
            temp_postings_dict = {}
            for line in lines:
                # Each line is expected to look like "<freq>:<token>:<postings>";
                # stem the token and split the postings on whitespace.
                fields = line.split(":")
                token = ps.stem(fields[1].lower().strip())
                temp_postings_dict[token] = fields[2].lower().strip().split()
            # At this point, the temp dictionary for this batch is built.
            print "completed building temp_dict. len(dict) = ", len(temp_postings_dict)
            for term in query_terms:
                if term in temp_postings_dict:
                    if len(postings_set) == 0:
                        postings_set = set(temp_postings_dict[term])
                    else:
                        # Keep only documents that contain every query term seen so far.
                        postings_set = postings_set & set(temp_postings_dict[term])
        query_data[index].postings = list(postings_set)
        print "size of postings = ", len(query_data[index].postings), " postings : ", query_data[index].postings[0:10]
        raw_input("continue : ")
    return query_data
Code example #2
File: StanfordSystem.py    Project: mailshanx/testApp
import pickle
import random
# Assumed module-level context from the surrounding project: the `explore` helper module.

def gen_stanford_ner_training_data():
    '''
    Use the BILOU annotation scheme: B=beginning, I=inside, L=last, O=outside, U=unit-length.
    It turns out the BILOU scheme reaches an F1 score of roughly 25% with the Stanford system.
    '''
    train_test_split = 0.25
    Y_ref = pickle.load(open('Y_ref.pkl', 'rb'))
    stanford_ner_training_data_file = open('stanford_ner_training_data_25_split.txt', 'w')
    stanford_ner_testing_data_file = open('stanford_ner_testing_data.txt', 'w')
    unrolled_data = explore.read_unrolled_data(unrolled_data_filename='training_unrolled_data.txt')
    for index in range(len(Y_ref)):
        # Treat out-of-range neighbours as 0 so the first token is not compared
        # against Y_ref[-1] (which wraps to the end) and the last token is not skipped.
        prev = Y_ref[index - 1] if index > 0 else 0
        next_ = Y_ref[index + 1] if index + 1 < len(Y_ref) else 0
        if Y_ref[index] == 0:
            label = 'O'
        elif Y_ref[index] == 1 and prev == 0 and next_ == 1:
            label = 'B'
        elif Y_ref[index] == 1 and prev == 1 and next_ == 1:
            label = 'I'
        elif Y_ref[index] == 1 and prev == 1 and next_ == 0:
            label = 'L'
        elif Y_ref[index] == 1 and prev == 0 and next_ == 0:
            label = 'U'
        # unrolled_data rows are (textID, offset, lineNo, token) tuples; field 3 is the token.
        record = unrolled_data[index][3] + "\t" + label + "\n"
        # ~25% of the records go to the training file and the rest to the
        # testing file, so the two sets stay disjoint.
        if random.random() < train_test_split:
            stanford_ner_training_data_file.write(record)
        else:
            stanford_ner_testing_data_file.write(record)
    stanford_ner_training_data_file.close()
    stanford_ner_testing_data_file.close()
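To make the BILOU mapping concrete, here is a small standalone sketch that turns a binary token-label sequence (1 = part of a product mention) into BILOU tags using the same neighbour rules as the function above; the function and variable names are illustrative, not from the project.

# Toy BILOU tagger over a binary label sequence (illustrative sketch).
def bilou_tags(y):
    tags = []
    for i, cur in enumerate(y):
        prev = y[i - 1] if i > 0 else 0
        nxt = y[i + 1] if i + 1 < len(y) else 0
        if cur == 0:
            tags.append('O')            # outside any mention
        elif prev == 0 and nxt == 1:
            tags.append('B')            # beginning of a multi-token mention
        elif prev == 1 and nxt == 1:
            tags.append('I')            # inside a multi-token mention
        elif prev == 1 and nxt == 0:
            tags.append('L')            # last token of a multi-token mention
        else:
            tags.append('U')            # unit-length (single-token) mention
    return tags

print(bilou_tags([0, 1, 1, 1, 0, 1]))   # ['O', 'B', 'I', 'L', 'O', 'U']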
Code example #3
File: FeatureFactory.py    Project: mailshanx/testApp
import numpy as np
# Assumed module-level context from the surrounding project: the `explore`
# helper module and read_n_filter_cprod_baseline_dict().

def get_cprod_baseline_dict_mapping(input_file_name):
    '''
    Encode whether a given token is a brand name, a common English word, or
    not found in the dictionary supplied by the competition's baseline2.
    '''
    cprod_baseline_dict_mapping_prods = {'brandname': 1.0, 'merchant': 2.0}  # tokens not in the dict stay at -1.0
    cprod_baseline_dict_mapping_lang = {'encommonword': 1.0, 'grammaticalword': 2.0}
    input_file_name_prefix = input_file_name.split("-")[0]
    unrolled_file_name_suffix = '_unrolled_data.txt'
    unrolled_file_name = input_file_name_prefix + unrolled_file_name_suffix
    unrolled_data = explore.read_unrolled_data(unrolled_data_filename=unrolled_file_name)
    cprod_data = read_n_filter_cprod_baseline_dict()
    # Defaults: the average token is not a brand or merchant name (-1.0), and a
    # token missing from the dictionary is likely close to a common English word (0.0).
    X_prods = np.ones(len(unrolled_data)) * -1.0
    X_lang = np.zeros(len(unrolled_data))
    for index, item in enumerate(unrolled_data):
        (textID, offset, lineNo, token) = item
        token = token.lower().strip()
        if token in cprod_data:
            label = cprod_data[token]
            if label in cprod_baseline_dict_mapping_prods:
                X_prods[index] = cprod_baseline_dict_mapping_prods[label]
            if label in cprod_baseline_dict_mapping_lang:
                X_lang[index] = cprod_baseline_dict_mapping_lang[label]
    return (X_prods, X_lang)
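Below is a hedged, self-contained sketch of the same two-vector encoding using a toy dictionary in place of the real one returned by read_n_filter_cprod_baseline_dict(); all tokens, labels, and names here are illustrative only.

# Toy version of the dictionary-feature encoding (illustrative sketch).
import numpy as np

toy_dict = {'ipod': 'brandname', 'amazon': 'merchant', 'the': 'grammaticalword'}
tokens = ['the', 'ipod', 'case', 'amazon']
prods_map = {'brandname': 1.0, 'merchant': 2.0}
lang_map = {'encommonword': 1.0, 'grammaticalword': 2.0}

X_prods = np.full(len(tokens), -1.0)   # default: not a brand/merchant name
X_lang = np.zeros(len(tokens))         # default: behaves like a common word
for i, tok in enumerate(tokens):
    label = toy_dict.get(tok)
    X_prods[i] = prods_map.get(label, X_prods[i])
    X_lang[i] = lang_map.get(label, X_lang[i])
print(X_prods)   # [-1.  1. -1.  2.]
print(X_lang)    # [2. 0. 0. 0.]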