Example #1
def updata_searchengine(Knowledge,SeIndexDir):
    #remove old index
    os.system('rm -rf %s' % SeIndexDir)
    os.system('mkdir %s' % SeIndexDir)

    QuestionNorm = []
    for KID,item in Knowledge.items():
        for question in item[0]:
            if int(question.IsNorm) == 1:
                QuestionNorm.append(question)

    question = []
    for item in QuestionNorm:
        mid = {}
        mid['question'] = item.Question
        mid['questionID'] = item.QuestionID
        question.append(mid)

    schema = Schema(title=TEXT(stored=True),quesId=ID(stored=True))
    ix = create_in(SeIndexDir, schema)

    writer = ix.writer()
    for i in question:
        word = PProc.cut_for_search(i['question'])
        PProc.syn_wordlist(word)
        writer.add_document(title=u' '.join(word),quesId = u'%s' % i['questionID'])
    writer.commit()
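A minimal usage sketch, not part of the original example: it assumes the index built above and the standard whoosh query API (open_dir, QueryParser); the function name search_questions is hypothetical.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_questions(SeIndexDir, query_text, limit=5):
    # open the index written above and run a simple query against the "title" field
    ix = open_dir(SeIndexDir)
    with ix.searcher() as searcher:
        query = QueryParser("title", ix.schema).parse(query_text)
        hits = searcher.search(query, limit=limit)
        # both fields are stored, so they can be read back from each hit
        return [(hit["quesId"], hit["title"]) for hit in hits]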
Example #2
def processConversation(conversation):
    global bag_of_words
    bag_of_words = {}
    sentences = conversation.split(".")
    tokenized = PreProcess.tokenize_sentences(sentences)
    filtered = PreProcess.RemovePunctAndStopWords(tokenized)
    bag_of_words = FreqDist(word.lower() for word in filtered)
Example #3
def ProcessRow_chat(row):
    result = []
    for question in row:
        filter = [u',',u'?',u'。',u';',u'!',u'“',u'”',u'’',u'‘',u',',u'.',u'!','?']
        for i in filter:
            question = question.replace(i, '')

        lenOfQ = len(question)
        if lenOfQ == 0:
            continue
        listOfWildcard = []
        for i in range(lenOfQ):
            if question[i] == '=':
                listOfWildcard.append(i)

        word,tag = PProc.withtag_cut(question)

        # re-insert a wildcard token at the word whose cumulative character
        # offset matches each recorded '=' position
        for idx in listOfWildcard:
            imid = 0
            for idxOfword,item in enumerate(word):
                if imid == idx:
                    word.insert(idxOfword, u'=')
                    tag.insert(idxOfword, 'wc')
                    break
                imid += len(item)

        word, tag = PProc.wordtag_process(word, tag)
        #print ' '.join(word)
        result += QuestionGet_chat(word, tag)

    return result
Example #4
def WriteAiml_how(knowledge,string_how,ID, fw_match):
    if len(knowledge) < 2:
        return
    question = []
    for item in knowledge:
        question.append(item.Question)

    parm = question[1]
    key = parm.split('>')[0]
    verb = parm.split('>')[1]
    keyList = PProc.withtag_cut(key)[0]
    PProc.syn_wordlist(keyList)
    verb = PProc.syn_word(verb)
    if '0' not in keyList:
        stringx = string_how.replace('KNOW', (u'KNOWLEDGE %s' % ID).encode('utf-8'))
        stringx = stringx.replace('KEY', ' '.join(keyList).encode('utf-8'))
        stringx = stringx.replace('VERB', verb.encode('utf-8'))
        stringx = stringx.replace('VALUE', (u'match-what|%s' %ID).encode('utf-8'))
        fw_match.write(stringx)
    else:
        keyList[keyList.index('0')] = '*'
        stringx = string_how.replace('KNOW', (u'KNOWLEDGE %s' % ID).encode('utf-8'))
        stringx = stringx.replace('KEY', ' '.join(keyList).encode('utf-8'))
        stringx = stringx.replace('VERB', verb.encode('utf-8'))
        stringx = stringx.replace('VALUE', (u'match-what|%s' % ID).encode('utf-8'))
        fw_match.write(stringx)
        keyList.remove('*')
        stringx = string_how.replace('KNOW', (u'KNOWLEDGE %s' % ID).encode('utf-8'))
        stringx = stringx.replace('KEY', ' '.join(keyList).encode('utf-8'))
        stringx = stringx.replace('VERB', verb.encode('utf-8'))
        stringx = stringx.replace('VALUE', (u'match-what|%s' % ID).encode('utf-8'))
        fw_match.write(stringx)
Example #5
def WriteAiml_what(knowledge, string_what, ID, fw_match):
    if len(knowledge) < 2:
        return
    key = knowledge[1].Question
    keyList, tag = PProc.withtag_cut(key)
    PProc.syn_wordlist(keyList)
    stringx = string_what.replace('KNOW', (u'KNOWLEDGE %s' % ID).encode('utf-8'))
    stringx = stringx.replace('KEY', ' '.join(keyList).encode('utf-8'))
    stringx = stringx.replace('VALUE', (u'match-what|%s' % ID).encode('utf-8'))
    fw_match.write(stringx)
Example #6
def main():
    vocab = pp.initVocab()
    output = pp.indexToLabel("output.mat", vocab)
    labelfile = "tag.txt"
    f = open(labelfile, "w")
    for i in range(len(output)):
        prob = output['data'][:][i]
        idx = np.argmax(prob)  # index of the highest-probability entry
        tag = vocab[idx]       # map that index back to its label
        f.write(tag + '\n')
    f.close()

Example #7
def processConversation(conversation, category):
    global bag_of_words, documentClass
    bag_of_words = {}
    sentences = conversation.split(".")
    tokenized = PreProcess.tokenize_sentences(sentences)
    filtered = PreProcess.RemovePunctAndStopWords(tokenized)
    for word in filtered:
        if word in bag_of_words:
            bag_of_words[word] = int(bag_of_words[word]) + 1
        else:
            bag_of_words[word] = 1
    #total=len(filtered)
    #bag_of_words=calculateFrequencies(total)
    addTermFrequency(bag_of_words)
Example #8
def Process1(df):
    pri_id = "企业名称"
    res = pd.DataFrame()
    res[pri_id] = df[pri_id].unique()
    # Convert currency units
    df = prep.Convert_money(df)
    # Extract registered-capital features (max, min, mean, variance)
    res = pd.merge(res,fea.GetValAvg(df,pri_id,"注册资金(元)"),on=pri_id)
    res = pd.merge(res,fea.GetValMaxMin(df,pri_id,"注册资金(元)"),on=pri_id)
    res = pd.merge(res,fea.GetValVar(df,pri_id,"注册资金(元)"),on=pri_id)

    # Extract categorical features
    num_fea = ['注册资金(元)',"出资比例"]
    cat_fea = [col for col in df.columns if col != pri_id and col not in num_fea]
    for col in cat_fea:
        res = pd.merge(res,fea.GetCategroicalCount(df,pri_id,col),on=pri_id)

    # Count rows where the legal-representative and chief-representative flags are empty
    res = pd.merge(res,fea.GetValNaCount(df,pri_id,"法定代表人标志","姓名"),on=pri_id)
    res = pd.merge(res,fea.GetValNaCount(df,pri_id,"首席代表标志","姓名"),on=pri_id)

    # Count the number of people holding each job title
    res = pd.merge(res,fea.CatRowsToCols(df,pri_id,"职务","姓名"),on=pri_id)

    # Extract contribution-ratio features (max, min, mean, variance)
    res = pd.merge(res,fea.GetValAvg(df,pri_id,"出资比例"),on=pri_id)
    res = pd.merge(res,fea.GetValMaxMin(df,pri_id,"出资比例"),on=pri_id)
    res = pd.merge(res,fea.GetValVar(df,pri_id,"出资比例"),on=pri_id)

    return res
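The fea.* helpers used above are project-specific and not shown here. Purely as an illustration, a GetValAvg-style helper could be sketched as below (the name get_val_avg and the column suffix are assumptions); GetValMaxMin and GetValVar would follow the same groupby pattern with max/min and variance.
import pandas as pd

def get_val_avg(df, pri_id, col):
    # per-entity mean of `col`, keyed by `pri_id`, ready to merge back into `res`
    out = df.groupby(pri_id)[col].mean().reset_index()
    return out.rename(columns={col: col + "_avg"})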
Example #9
    def preProc(self):
        dataCleaner = PreProcess(self.df)
        self.df = dataCleaner.df
        # alert the user
        tkMessageBox.showinfo("K Means Clustering",
                              "Preprocessing completed successfully!")
Example #10
def top_k_words_of(platform,
                   k=20,
                   cmt_num=0,
                   by_category=False,
                   cate_name="",
                   show_cmt_of_words=False):
    cmt_corpus = PreProcess.get_review_corpus_by(platform=platform,
                                                 num=cmt_num,
                                                 by_category=by_category,
                                                 cate_name=cate_name)
    tpk_words1 = top_k_fre_of(cmt_corpus, k=k)
    tpk_words2 = tf_idf_topk(cmt_corpus, k=k)
    if show_cmt_of_words:
        sentence_list = []
        for cmt in cmt_corpus:
            sentence_list += cut_ch_sentence(cmt)
        from collections import defaultdict
        words_comment_dir1 = defaultdict(list)
        wc_dir2 = defaultdict(list)
        for words in tpk_words1:
            for sentence in filter(lambda sent: words in sent, sentence_list):
                words_comment_dir1[words].append(sentence)
        for words in tpk_words2:
            for sentence in filter(lambda sent: words in sent, sentence_list):
                wc_dir2[words].append(sentence)
        for key in words_comment_dir1.keys():
            print("------------------%s------------------\n" % key,
                  words_comment_dir1[key], "\n")
        print("=========================================")
        for key in wc_dir2.keys():
            print("-------------------%s------------------\n" % key,
                  wc_dir2[key], "\n")
Example #11
def get_aspect_json(platform,
                    k=20,
                    cmt_num=0,
                    by_category=False,
                    cate_name=""):
    import json
    cmt_corpus = PreProcess.get_review_corpus_by(platform=platform,
                                                 num=cmt_num,
                                                 by_category=by_category,
                                                 cate_name=cate_name)
    tpk_words = tf_idf_topk(cmt_corpus, k=k)
    sentence_list = []
    for cmt in cmt_corpus:
        sentence_list += cut_ch_sentence(cmt)
    words_tree = dict()
    words_tree["name"] = cate_name
    words_tree["child"] = []
    for words in tpk_words:
        comment_dic = dict()
        comment_dic["child"] = []
        comment_dic["name"] = words
        for sentence in filter(lambda sent: words in sent, sentence_list):
            comment_dic["child"].append(sentence)
        words_tree["child"].append(comment_dic)
    return json.dumps(words_tree)
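For reference, a hypothetical illustration (not taken from the original) of the tree shape this function serializes, based on the "name"/"child" keys used above:
example_tree = {
    "name": "some-category",  # cate_name
    "child": [
        {"name": "word-1", "child": ["a sentence mentioning word-1"]},
        {"name": "word-2", "child": ["a sentence mentioning word-2"]},
    ],
}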
Example #12
def _F_Clsuter_Geo():
    if os.path.exists(data_path + "data/_F_geo.feather"):
        df = feather.read_dataframe(data_path + "data/_F_geo.feather")
        return df
    # Merge uid and geo_code from the operation and transaction tables
    geo_info = pd.concat(
        (op_info[[pri_id, 'geo_code']], trans_info[[pri_id, 'geo_code']]))
    geo_info['pos'] = geo_info['geo_code'].apply(_F.Decode)

    temp = geo_info[geo_info['pos'] != -1]

    res = [x for x in temp['pos'].values]
    X = np.asarray(res)
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=20, random_state=2018).fit(X)
    temp['cluster_id'] = kmeans.labels_

    t = temp.groupby([
        'UID', 'cluster_id'
    ])['pos'].count().reset_index().rename(columns={'pos': 'cluster_count'})
    c = pd.pivot_table(t,
                       index='UID',
                       columns='cluster_id',
                       values='cluster_count').fillna(0).reset_index()
    # Rename columns
    _Prep = _P.Process()
    c = _Prep.RenameColumns(c, [pri_id], 'cluster')
    # Persist to disk
    feather.write_dataframe(c, data_path + "data/_F_geo.feather")
    return c
Example #13
def convert(src, tgt, txt, nativize, preoptions, postoptions):
    txt = PreProcess.PreProcess(txt, src, tgt)

    if 'siddhamUnicode' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamUnicode'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'siddhamUnicode' in preoptions and src == 'Siddham':
        src = 'SiddhamUnicode'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'

    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)

    transliteration = Convert.convertScript(txt, src, tgt)

    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(
            transliteration, src, tgt)
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(
                transliteration)

    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)

    return transliteration
Example #14
def removeRibosomalRNA(fastq1, outfile):
    '''Remove ribosomal RNA using sortMeRNA'''

    if PARAMS['data_type'] == 'metatranscriptome':
        tool = pp.runSortMeRNA(
            fastq1, outfile, **{
                **PARAMS,
                **{
                    'fn_suffix': '_deadapt.' + FASTQ1_SUFFIX
                }
            })
        tool.run(**PARAMS)
    else:
        assert PARAMS['data_type'] == 'metagenome', \
            'Unrecognised data type: {}'.format(PARAMS['data_type'])

        inf1 = fastq1
        inf2 = P.snip(inf1, '.fastq.1.gz') + '.fastq.2.gz'
        inf3 = P.snip(inf1, '.fastq.1.gz') + '.fastq.3.gz'

        outf1 = outfile
        outf2 = P.snip(outf1, '.fastq.1.gz') + '.fastq.2.gz'
        outf3 = P.snip(outf1, '.fastq.1.gz') + '.fastq.3.gz'

        symlink(inf1, outf1)
        if os.path.exists(inf2):
            symlink(inf2, outf2)
        if os.path.exists(inf3):
            symlink(inf3, outf3)
Example #15
def assignment_fairea(cands,
                      pos,
                      fitness,
                      G,
                      weight_probability=[1, 0, 0],
                      version=4,
                      kk=0,
                      local=False):

    matched_1 = PP.pre_assignment(cands, pos, fitness, G)
    if weight_probability[2] == 1:
        if kk != 0:
            matched_1.extend(
                PP.support_group_assignment(cands, pos, fitness, G, p=kk))
        else:
            matched_1.extend(
                PP.support_group_assignment(cands, pos, fitness, G))
    i = -1
    while True:
        i += 1
        positions, final_matched = select_positions(G,
                                                    pos,
                                                    cands,
                                                    fitness,
                                                    weight_probability,
                                                    i + 1,
                                                    version=version,
                                                    local=local)
        if len(final_matched) < len(positions):
            final_matched, G = Hu.Hungarian(
                cands,
                positions,
                final_matched,
                G,
                fitness,
                weight_probability=weight_probability,
                version=2,
                local=local)
            if final_matched == []:
                return [], []

        if len(final_matched) == len(pos):
            break
    final_matched = list(final_matched)
    final_matched.extend(matched_1)

    return G, set(final_matched)
Example #16
def SaveToFolder(gtImage, sliceNum, imgNumFolder):
    sampledImg = preProc.SampleTest1(gtImage[:,:,sliceNum])
    #print(sampledImg.GetSpacing())
    sampledImgArr = sitk.GetArrayFromImage(sampledImg)
    #slice1Copy = np.uint8(sampledImgArr)
    path = './TrainingImages'
    cv2.imwrite(os.path.join(path , 'testImage{0}.png'.format(imgNumFolder)), sampledImgArr)
    cv2.waitKey(0)
Example #17
def assignment_max_weight(cands,
                          pos,
                          fitness,
                          G,
                          weight_probability=[1, 0, 0],
                          version=3):
    matched_1 = PP1.pre_assignment(cands, pos, fitness, G)
    if weight_probability[2] == 1:
        matched_1.extend(PP1.support_group_assignment(cands, pos, fitness, G))
    bi_G = nx.Graph()
    bi_edges, edge_weights, l = UW.p_based_weight(pos, cands, fitness, G)
    bi_G.add_weighted_edges_from(bi_edges)
    final_matched = nx.max_weight_matching(bi_G)
    final_matched = list(final_matched)
    final_matched.extend(matched_1)
    final_matched = set(final_matched)

    return G, final_matched
Example #18
def processConversation(conversation, category):
    global bag_of_words, documentClass
    bag_of_words = {}
    sentences = conversation.split(".")
    tokenized = PreProcess.tokenize_sentences(sentences)
    filtered = PreProcess.RemovePunctAndStopWords(tokenized)

    for word in filtered:
        if word in bag_of_words:
            bag_of_words[word] = int(bag_of_words[word]) + 1
        else:
            bag_of_words[word] = 1
    total = len(filtered)
    bag_of_words = calculateFrequencies(total)
    if category in documentClass:
        new_dict = merge_two_dicts(documentClass[category], bag_of_words)
        documentClass[category] = new_dict
    else:
        documentClass[category] = bag_of_words
Example #19
def application(file_path):
    data = PP.image_process(file_path)
    label = ''
    if len(data) == 0:
        print("Recognition failed; please provide a clearer image")
    else:
        print("Recognizing......")
        for i in range(len(data)):
            preValue = restore_model(data[i:i + 1])[0]
            label += str(preValue)
        print("Recognition result: " + label)
Example #20
def Encode(input_dir, output_dir, codebook_name, pixel_size, start):
    global codebook
    with open(codebook_name, 'r') as f:
        codebook = f.read()
        codebook = ast.literal_eval(codebook)

    num = 1
    PreProcess.dir_check(output_dir, emptyflag=True)
    compress_rate = []
    for f in os.listdir(input_dir):
        img_path = os.path.join(input_dir, f)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        ret, img = cv2.threshold(img, 0.5, 1, cv2.THRESH_BINARY)
        height = len(img)
        width = len(img[0])

        codevalue = encode(img, height=height, width=width)
        codevalue = dict(
            sorted(codevalue.items(),
                   key=lambda item: (item[0][0], item[0][1])))
        codevalue = tobinary(codevalue, height, width)
        output_path = os.path.join(output_dir, f[0:f.rfind('.bmp')]) + '.tt'
        original_pixel = height * width * len(format(pixel_size - 1, 'b'))
        final_pixel = len(codevalue)

        with open(output_path, 'wb') as g:
            g.write(codevalue.encode())
        end = datetime.datetime.now()

        compress_rate.append(original_pixel / final_pixel)

        print(
            '\rSaving encoding results for picture %d, program has run %s, the mean compression ratio is %0.2f'
            % (num, end - start, np.mean(compress_rate)),
            end='')
        num = num + 1
    return np.mean(compress_rate)
Example #21
def Decode(input_dir, output_dir, original_img_dir, codebook_name, start):
    with open(codebook_name, 'r') as f:
        codebook = f.read()
        codebook = ast.literal_eval(codebook)

    global decodebook
    decodebook = {v: k for k, v in codebook.items()}

    num = 1
    PreProcess.dir_check(output_dir, emptyflag=True)

    error_rate_total = []
    for f in os.listdir(input_dir):
        tt_path = os.path.join(input_dir, f)
        if os.path.splitext(tt_path)[1] == '.tt':
            with open(tt_path, 'rb') as g:
                tt = g.read()
            img = decode(tt)
            ret, img = cv2.threshold(img, 0.5, 255, cv2.THRESH_BINARY)
            img_original_path = os.path.join(original_img_dir,
                                             f[0:f.rfind('.tt')]) + '.bmp'
            img_original = cv2.imread(img_original_path, cv2.IMREAD_GRAYSCALE)

            output_path = os.path.join(output_dir,
                                       f[0:f.rfind('.tt')]) + '.bmp'
            cv2.imwrite(output_path, img)

            error_rate = fidelity(img_original, img)
            error_rate_total.append(error_rate)

            end = datetime.datetime.now()
            print(
                '\rSaving decoding results for picture %d,SNR is %0.2f,the mean SNR is %0.2f, the program has run %s'
                % (num, error_rate, np.mean(error_rate_total), end - start),
                end='')
            num = num + 1
    return np.mean(error_rate_total)
Example #22
def pre_process():
    global resized, processed_images

    try:
        cut_images = CutUp.box_extraction(resized)
        resized = None
    except IndexError:
        resized = None
        return 'We couldn\'t detect all of the gridlines.'

    for cut_image in cut_images:
        processed_images.append(
            PreProcess.pre_process(cut_image, b=7, by_mass=False, boundary=8))

    return 'success'
Example #23
def _F_GeoCode(encode_type="LabelEncode", n=3):
    if os.path.exists(data_path + "data/_F_geo_code.feather"):
        df = feather.read_dataframe(data_path + "data/_F_geo_code.feather")
        return df
    # Take each user's top-N most frequently active geo_code values
    geo_info = pd.concat(
        (op_info[[pri_id, 'geo_code',
                  'day']], trans_info[[pri_id, 'geo_code', 'day']]))
    temp = _F.TopNGeo_code(geo_info, pri_id, 'day', n)
    # Encode
    _Prep = _P.Process()
    temp = _Prep.CatColConvert(temp, pri_id, encode_type)
    # Persist to disk
    feather.write_dataframe(temp, data_path + "data/_F_geo_code.feather")
    return temp
Example #24
    def learn(self, text_df):
        """Spark transformation to learn the adjacent terms of a given ngram"""

        ngram = NGram(n=self.n, inputCol='tokenized_text', outputCol='ngram')
        ngram_df = ngram.transform(text_df)
        # create the ngram to adjacent term mappings
        ngram_list = ngram_df.select("ngram").rdd.map(lambda r: r['ngram']).collect()
        self.ngram_model = ngram_df.rdd \
            .map(lambda x: PreProcess.generate_adjacent_terms(x.asDict()['ngram'])) \
            .flatMap(lambda xs: [x for x in xs]) \
            .map(lambda y: (y[0], [y[1]])) \
            .reduceByKey(lambda a, b: a + b).collect()

        # create a list of the keys in the model and store them
        # (ngram_model is already a collected Python list, so use a comprehension)
        self.model_keys = [x[0] for x in self.ngram_model]
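A minimal invocation sketch, assuming a running SparkSession and an instance of the class that defines learn (the class name NgramModel is hypothetical); the tokenized_text column name follows the code above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
text_df = spark.createDataFrame(
    [(["the", "quick", "brown", "fox", "jumps"],),
     (["the", "quick", "red", "fox"],)],
    ["tokenized_text"])
# model = NgramModel(n=2)   # hypothetical constructor
# model.learn(text_df)      # fills model.ngram_model and model.model_keys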
Example #25
def update(loss, reviews):
    global model
    lr = 0.000003
    #update 'W' parameter of model
    for (userid, productid) in loss:
        for word in reviews[(userid, productid)][:-1]:
            model["W"][words.index(word)] -= lr * loss[(userid, productid)]

    #update 'U' parameter of model
    for user in users:
        userid = users[user]
        productlist = pre.getproductlist(userid)
        for product in productlist:
            factor = np.zeros(D)
            for word in reviews[(userid, product)][:-1]:
                factor += model["P"][words.index(word)]
            model["U"][userid] -= lr * loss[
                (userid, product)] * (factor * model["V"][product])

    #update 'V' parameter of model
    for product in products:
        productid = products[product]
        userlist = pre.getreviewers(productid)
        for user in userlist:
            factor = np.zeros(D)
            for word in reviews[(user, productid)][:-1]:
                factor += model["P"][words.index(word)]
            model["V"][productid] -= lr * loss[
                (user, productid)] * (factor * model["U"][user])

    #update 'P' parameter of model
    for wordid in range(len(words)):
        for (userid, productid) in reviews:
            if words[wordid] in reviews[(userid, productid)][:-1]:
                model["P"][wordid] -= lr * loss[(userid, productid)] * (
                    model["U"][userid] * model["V"][productid])
Example #26
def convert(src, tgt, txt, nativize, preoptions, postoptions):
    txt = PreProcess.PreProcess(txt, src, tgt)

    if 'siddhammukta' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhamap' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhammukta' in preoptions and src == 'Siddham':
        src = 'SiddhamDevanagari'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'
    if 'nepaldevafont' in postoptions and tgt == 'Newa':
        tgt = 'Devanagari'
    if 'ranjanalantsa' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False
    if 'ranjanawartu' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False

    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)

    transliteration = Convert.convertScript(txt, src, tgt)

    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(
            transliteration, src, tgt)
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(
                transliteration)

    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)

    if src == "Tamil" and tgt == "IPA":
        r = requests.get("http://anunaadam.appspot.com/api?text=" + txt +
                         "&method=2")
        r.encoding = r.apparent_encoding
        transliteration = r.text

    return transliteration
Example #27
def setup():
    global testParagraphs
    global trainingParagraphs
    global happySadScoredWords
    testParagraphs = []
    trainingParagraphs = []
    happySadScoredWords = []
    print("Loading Corpus...")
    testParagraphs, trainingParagraphs = PreProcess.getRatedParagraphs()
    print(len([r for r in trainingParagraphs if r["overAllRating"] == 1]))
    print(len([r for r in trainingParagraphs if r["overAllRating"] == 2]))
    print(len([r for r in trainingParagraphs if r["overAllRating"] == 3]))
    print(len([r for r in trainingParagraphs if r["overAllRating"] == 4]))
    print(len([r for r in trainingParagraphs if r["overAllRating"] == 5]))
    print("Loading Happy/Sad Words...")
    happySadScoredWords = HappySad.loadHSWords("./words/happyAndSadWords3.txt")
Example #28
    def chooseFile(self, item):
        for index in range(self.listWidget.count()):
            if self.listWidget.item(index).text() == item.text():
                self.itemIndex = index
        preProcess = PreProcess.PreProcess()
        content = preProcess.getArticleContent(repertory + "/" + item.text())
        if self.method == 1:
            self.sents = preProcess.getSents(content)
            size = len(self.sents)
        else:
            size, self.sents = preProcess.getXMLsents(content)

        self.labelRest.setText('0/' + str(size))
        self.file = item.text()
        self.newSent = []
        self.pushButton_save.setDisabled(True)
Example #30
def RunPreprocess():

    print "---PreProcess"
    PreProcess.PreProcess()
    print "---PreProcess1"
    PreProcess1.PreProcess1()
    print "---PreProcess2"
    PreProcess2.PreProcess2()
    print "---PreProcess3"
    PreProcess3.PreProcess3()
    print "---PreProcess4,40"
    PreProcess4.PreProcess4(40)
    print "---PreProcess4,30"
    PreProcess4.PreProcess4(30)
    print "---PreProcess4Base,40"
    PreProcess4Base.PreProcess4Base(40)
    print "---PreProcess4Base,30"
    PreProcess4Base.PreProcess4Base(30)
Example #31
def match(gen, fitness, cand, cands, pos, m, G):
    final_matched = []
    gender = nx.get_node_attributes(G, 'att')
    edges = []
    bi_G = nx.Graph()
    for (u, v), w in fitness.items():
        if v in cand and cands[u] == gen:
            edges.append((u, v, w))
    bi_G.add_weighted_edges_from(edges)
    matched = nx.max_weight_matching(bi_G)
    l = {}
    for (u, v) in matched:
        if u in cand:
            temp = u
            u = v
            v = temp
        l[(u, v)] = fitness[(u, v)]
    l = {k: v for k, v in sorted(l.items(), key=lambda item: item[1], reverse=True)}
    l = list(l.keys())

    i,j = 0, 0
    while j<=len(l)-1 and i < m:
        (u,v) = l[j]
        j+=1
        if u in cands:
            temp = u
            u = v
            v = temp
        if u in pos and v in cands:
            final_matched.append((u,v))
            i+=1
            gender[u] = gen
            pos.remove(u)
            del cands[v]
            remove_list = []
            for (a, b) in fitness.keys():
                if a == v or b == u:
                    remove_list.append((a, b))
            for item in remove_list:
                del fitness[item]
            nx.set_node_attributes(G, gender, 'att')
            final_matched.extend(PP.pre_assignment(cands, pos, fitness, G))

    return final_matched
Example #32
def setup():
    global testReviews
    global trainingReviews
    global Iclassifiers
    global IEx1features
    global IIclassifiers
    global IIEx1features
    print("Loading Corpus...")
    testReviews, trainingReviews = PreProcess.getRatedReviews()
    print(len([r for r in trainingReviews if r["overAllRating"] == 1]))
    print(len([r for r in trainingReviews if r["overAllRating"] == 2]))
    print(len([r for r in trainingReviews if r["overAllRating"] == 3]))
    print(len([r for r in trainingReviews if r["overAllRating"] == 4]))
    print(len([r for r in trainingReviews if r["overAllRating"] == 5]))

    print("IGNORE////////////////")
    #Get the classifier from Exercise 1 to compute rating for each paragraph
    Iclassifiers, IEx1features = Exercise1.partI(ClassifierRunner.naiveBayes)
    #Get the classifier from Exercise 1 to compute rating for each paragraph
    IIclassifiers, IIEx1features = Exercise1.partII(ClassifierRunner.maxEnt)
    print("END IGNORE////////////////")
Example #34
def partI(classifier):
    print("PART I Classify by author")
    print("Loading Corpus...")
    testReviews, trainingReviews = PreProcess.getByAuthor()
    authorTagTraining = [(e["text"], e["author"]) for e in trainingReviews] 
    authorTagTesting = [(e["text"], e["author"]) for e in testReviews]
    featureExtractors = []
    if classifier == ClassifierRunner.naiveBayes:
        featureExtractors.append(HappySad.featureNumericScore)
        featureExtractors.append(HappySad.featureHitCountBucketed)
        featureExtractors.append(AuthorshipFeatures.typeTokenRatioBucketed)
        featureExtractors.append(AuthorshipFeatures.vocabSizeBucketed)
    else:
        featureExtractors.append(HappySad.featureNumericScore)
        featureExtractors.append(HappySad.featureHitCount)
        featureExtractors.append(AuthorshipFeatures.typeTokenRatio)
        featureExtractors.append(AuthorshipFeatures.vocabSize)

    #BASELINE RUN
    print("Running Baseline")
    trainedBaseline = ClassifierRunner.runNfoldCrossValidation(ClassifierRunner.mostCommonTag, authorTagTraining, featureExtractors, 4)
    predictionsBaseline = [c[2] for c in trainedBaseline]
    truthsBaseline = [c[3] for c in trainedBaseline]
    predictionsTesting,bAcc = ClassifierRunner.predictTagged(trainedBaseline[0][0], featureExtractors, authorTagTesting)
    truthsTesting = [c[1] for c in authorTagTesting]
    bRMS = Evaluator.rmsBinaryDifference(predictionsTesting, truthsTesting)
    print("BaseLine RMS Error:", bRMS)

    #OUR CLASSIFIER RUN
    trainedClassifiers = ClassifierRunner.runNfoldCrossValidation(classifier, authorTagTraining, featureExtractors, 4)
    predictions = [c[2] for c in trainedClassifiers]
    truths = [c[3] for c in trainedClassifiers]
    print("Running most accurate trained classifier on test set")
    predictionsTesting, cAcc = ClassifierRunner.predictTagged(trainedClassifiers[0][0], featureExtractors, authorTagTesting)
    truthsTesting = [c[1] for c in authorTagTesting]
    cRMS = Evaluator.rmsBinaryDifference(predictionsTesting, truthsTesting)
    Evaluator.createConfusionMatrix([t for d,t in authorTagTraining], predictionsTesting, truthsTesting)
    print("Our RMS Error:", cRMS)
    print("Accuracy improvement over baseline:", cAcc - bAcc)
    print("RMS Error reduction from baseline:", bRMS - cRMS)
Example #35
def main(dataPath):
    PreProcess.csvPath=dataPath
    # PreProcess.ReadFilesToDataFrame()
    # PreProcess.SplitToPictureAndFacesAnswers()
    # BadParticipantsRemove.RemoveParticipantsNotReportPANAS()
    # PreProcess.GetTestRates()
    # PreProcess.SaveTests()
    # PreProcess.LoadTestsWithScores()

    # PreProcess.GetTestMovie()
    # PreProcess.addUserInfoToTest()
    # PreProcess.SaveTests()
    # PreProcess.LoadTestsWithScores()
    PreProcess.LoadTestsWithScores()

    Graphs.ShowFalsePositiveGraphs(PreProcess.TestsWithScores.loc[PreProcess.TestsWithScores['video']=='calm'])
    # Graphs.ShowDifferenceBetweenTests(PreProcess.TestsWithScores)

    # firstCalm=PreProcess.TestsWithScores.loc[PreProcess.TestsWithScores['userId']==1]
    Graphs.GraphByQuestionType(PreProcess.TestsWithScores)

    Graphs.CalmMinusPositiveGraph(PreProcess.TestsWithScores)
Example #36
def compile_string_as_string(program, debug=0, opt_vec=0, sim_end_time_fs=100000, top_module=""):
    """ This is a helper function """

    preProcess = PreProcess()
    preProcess.load_source_from_string(program)
    preProcess.preprocess_text()  # comments and includes and defines and undefs

    if debug:
        preProcess.print_text()

    data = "".join(preProcess.text)
    parser = new_Verilog_EBNF_parser()
    try:
        parsed_data = parser.parseString(data, True)

    except ParseException, err:
        print "err.line is ", err.line
        print "col is ", err.column
        text_lines = err.line.split(";")
        line_num = 0
        char_count = 0
        last_line = None
        print_next_line = False
        for line in text_lines:
            line += ";"
            line_num += 1
            if print_next_line:
                print "[%3d] %s" % (line_num, line)
                break
            if (char_count + len(line)) >= err.column:
                if last_line:
                    print "[%3d] %s" % (line_num - 1, last_line)
                print "[%3d] %s" % (line_num, line)
                print "      " + " " * (err.column - char_count - 1) + "^"
                print_next_line = True
            else:
                last_line = line
                char_count += len(line)

        print err
        return None
Example #37
data_labels = [
    float(line.rstrip('\n')) for line in open('csvData/train_label.data')
]

rawData = open('csvData/test.data', 'rb')
temp = np.loadtxt(rawData, delimiter=',')
testset = np.c_[np.ones(len(temp)), temp]

test_labels = [
    float(line.rstrip('\n')) for line in open('csvData/test_label.data')
]

###

# pre-process

PP = PreProcess.PreProcess(data, n_buckets=10,
                           func='boolean')  #,swap_labels=True)
data = PP.fit(data)
testset = PP.fit(testset)

data_labels = PP.processLabels(data_labels)
test_labels = PP.processLabels(test_labels)

# cross-validation

best_C = 2
best_ro = 0.01
best_accuracy = 0
best_epoch = 10
best_g0 = 1.001
'''
for C in [4,2,0.5,0.25,0.125]:#,0.0625,0.03125]:
    if file.endswith(".csv"):
        filename = (file).split('.')
        #Split.callSplit(inputdir,filename[0])
        #print file
        #headers = LabelEncoding.convertlabels(trainingdir+'/'+file,outputdir)
'''

for file in os.listdir(trainingdir):
    if file.endswith(".csv"):
        #print file
        headers = LabelEncoding.convertlabels(trainingdir+'/'+file,outputdir)

labelledFilesPath = outputdir+'/labelled'
#Using one hot encoder
for file in os.listdir(labelledFilesPath):
    if file.endswith(".csv"):
        PreProcess.convertallAttributes(labelledFilesPath+'/'+file,outputdir)

preprocessedFilesPath = outputdir+'/preprocessed'

outputfile=open(outputdir+'/ClassifiersResults.txt','a')

truelabelPath = inputdir+'/truelabels'
for file in os.listdir(labelledFilesPath):
    if file.endswith(".csv"):
    	
        filename = (file).split('.')
        #trainSet,trainSetTrueLabel = EvaluatingClassifiers.callClassifiers(preprocessedFilesPath+'/'+file,truelabelPath+'/'+file,filename[0],outputdir,inputdir,filename[1])
        trainSet,trainSetTrueLabel = EvaluatingClassifiers.callClassifiers(preprocessedFilesPath+'/'+file,truelabelPath+'/'+file,filename[0],outputdir)
        
        #LearningCurve.plotLearningCurve(trainSet, trainSetTrueLabel,outputdir+'/LearningCurve',filename[0])