def triple_classifier(tweet):
    '''
    Output labels:
    0 neutral
    1 positive
    2 angry
    3 anxious
    4 sad
    5 disgusted
    6 other negative
    '''
    sentiment = 0
    if isinstance(tweet['text'],unicode):
        text = tweet['text']
    else:
        text = tweet['text'].decode('utf-8')
    keywords_list = []

    emoticon_sentiment = emoticon(text.encode('utf-8'))
    if emoticon_sentiment != MIDDLE:
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        keywords_list = entry
        if emoticon_sentiment == POSITIVE:
            sentiment = emoticon_sentiment
            text = u''
        else:
            sentiment = flow_psychology_classfiy(text.encode('utf-8'))
            if sentiment == 0:
                sentiment = 6
            text = u''
    
    if text != u'':
        entries = cut(cut_str, text.encode('utf-8'))
        entry = [e.decode('utf-8', 'ignore') for e in entries]
        keywords_list = entry
        
        
        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] <= s[1]:
            bow = dictionary_2.doc2bow(entry)
            s2 = [1, 1]
            for pair in bow:
                s2[0] = s2[0] * (step2_score[pair[0]][0] ** pair[1])
                s2[1] = s2[1] * (step2_score[pair[0]][1] ** pair[1])
            if s2[0] > s2[1]:
                sentiment = POSITIVE
            else:
                sentiment = flow_psychology_classfiy(text.encode('utf-8'))
                if sentiment == 0:
                    sentiment = 6
        else:
            sentiment = MIDDLE        

    return sentiment
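The step-1/step-2 scoring above is a Naive-Bayes-style product of per-token class scores over a gensim bag-of-words. A minimal standalone sketch of the same idea, with a toy dictionary and a fabricated score table standing in for dictionary_1/step1_score:

# -*- coding: utf-8 -*-
from gensim import corpora

# Toy corpus and a made-up [class0, class1] score per token id (stand-in for step1_score).
docs = [[u'难过', u'生气'], [u'开心', u'难过']]
dictionary = corpora.Dictionary(docs)
score = {token_id: [0.6, 0.4] for token_id in dictionary.keys()}

entry = [u'难过', u'生气']
bow = dictionary.doc2bow(entry)        # [(token_id, count), ...]
s = [1.0, 1.0]
for token_id, count in bow:
    s[0] *= score[token_id][0] ** count
    s[1] *= score[token_id][1] ** count
label = 0 if s[0] > s[1] else 1        # pick the class with the larger product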
Example #3
def make_audio(location, name, d_csv, start_idx, end_idx):
    for i in range(start_idx,end_idx):
        f_name = name + str(i)
        link = "https://www.youtube.com/watch?v="+d_csv.loc[i][0]
        start_time = d_csv.loc[i][1]
        end_time = start_time+3.0
        utils.download(location,f_name,link)
        utils.cut(location,f_name,start_time,end_time)
        print("\r Process audio... ".format(i) + str(i), end="")
    print("\r Finish !!", end="")
Example #4
 def cut_text(item):
     text = item['text'].encode('utf-8')
     terms_cx = cut(s, text, cx=True)
     terms = [term for term, cx in terms_cx]
     item['terms'] = terms
     item['terms_cx'] = terms_cx
     return item
    def createContainer(self, result):
        time = '00:' + result['duration']
        video_codec = result['video_codec']

        input_path = Path.cwd()
        input_file = str(input_path / "BBB.mp4")
        subtitles_file = str(input_path / "subtitles.srt")

        output_path = Path.cwd() / "Containers" / result['name']
        output_path.mkdir(parents=True, exist_ok=True)
        output_path_container = Path.cwd() / "Containers"
        output_container = str(output_path_container) + \
            "/" + result['name'] + ".mp4"

        output_cut = cut(input_file, output_path, time)
        output_resize = resize(output_cut, output_path, result['size'])
        output_mono = mono(output_cut, output_path, result['audio_codec'])
        output_subtitles = subtitles(subtitles_file, output_path)
        output_lower_bitrate = lower_bitrate(
            output_cut, output_path, result['bitrate'])

        command = f'ffmpeg -i {output_resize} -i {output_mono} -i {output_lower_bitrate} -map 0:v -map 1:a -map 2:a -c:a copy -c:v {video_codec} -vf "ass={output_subtitles}" {output_container}'
        os.system(command)
        os.system(f"ffplay {output_container}")

        return output_container
Example #6
    def transform(self):
        """ Bestow weights on candidate by derivation """
        formulas, arguments, temp = self.fit()

        print("Get the candidate by derivation")
        for mark, formula in formulas:
            feature = self.features[mark]

            # Derivation
            x = utils.get(arguments, mark)
            y = utils.get(temp, mark)

            y_hat = formula.predict(x)

            sections = utils.growth(list(y_hat))

            candidates, phrase = [], ""

            for section in sections:
                for index in section:
                    phrase += feature[str(index)]['value']
                candidates.append(phrase)
                phrase = ""

            candidates = "".join(utils.cut(candidates, self._constants))

            for index, sentence in feature.items():
                if sentence["value"] in candidates:
                    sentence["deriv"] = 1.2 if sentence["policy"] > 0 else 0.2

            self.features[mark].update(feature)
        print("done")
Example #7
def load_test():
    ## load word vector
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(name='%s/word2vec/w2v_sgns_%s_%s_%s.npy' % (
        config.ModelOutputDir, config.embedding_size, config.corpus_version, datestr))
    ## load train data
    with utils.timer('Load test data'):
        test_data, uid_list, info_id_list = utils.load_test_data(test_file)
        test_data, uid_list, info_id_list  = test_data[:int(0.2 * len(test_data))], uid_list[:int(0.2 * len(uid_list))], info_id_list[:int(0.2 * len(info_id_list))]
    with utils.timer('representation for test'):
        X_test = []
        text_test = []
        for i in range(len(test_data)):
            text = test_data[i]
            if(text == ''):
                continue
            words = utils.cut(text)
            if(len(words) == 0):
                continue
            X_test.append([word2vec.get(w, word2vec['_UNK']) for w in words])
            text_test.append(text)
    del word2vec
    gc.collect()

    return X_test, text_test, uid_list, info_id_list
Example #8
 def cut_text(item):
     text = item['text'].encode('utf-8')
     # terms_cx = cut(s, text, cx=True)
     # terms = [term for term, cx in terms_cx]
     item['terms'] = cut(s, text, cx=False)
     # item['terms_cx'] = terms_cx
     return item
Example #9
def pathway_from_files(name, dirname):
    path = pathway(name)
    try:
        for key, suffix in pathway.components.items():
            f = open('%s.%s' % (join(dirname,name),suffix))
            path[key] = cut(f)
    except:
        raise IOError, 'path error'
    return path
Example #10
def pathway_from_files(name, dirname):
    path = pathway(name)
    try:
        for key, suffix in pathway.components.items():
            f = open('%s.%s' % (join(dirname, name), suffix))
            path[key] = cut(f)
    except:
        raise IOError, 'path error'
    return path
Example #11
def make_network(topic, date, window_size, max_size=100000, ts=False):
    end_time = datetime2ts(date)
    start_time = end_time - window2time(window_size)

    g = nx.DiGraph()

    #need repost index
    topic = cut(s, topic.encode('utf-8'))
    statuses_search = XapianSearch(path='/opt/xapian_weibo/data/', name='master_timeline_weibo', schema_version=2)
    query_dict = {'text': topic, 'timestamp': {'$gt': start_time, '$lt': end_time}}

    if ts:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'timestamp', 'retweeted_status'], max_offset=max_size)
    else:
        count, get_statuses_results = statuses_search.search(query=query_dict, field=['text', 'user', 'retweeted_status'], max_offset=max_size)
    print 'topic statuses count %s' % count

    if ts:
        uid_ts = {}
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    repost_ts = int(status['timestamp'])
                    source_status = acquire_status_by_id(rt_mid)
                    source_uid = source_status['user']
                    source_ts = int(source_status['timestamp'])
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    if repost_uid not in uid_ts:
                        uid_ts[repost_uid] = repost_ts
                    else:
                        if uid_ts[repost_uid] > repost_ts:
                            uid_ts[repost_uid] = repost_ts
                    if source_uid not in uid_ts:
                        uid_ts[source_uid] = source_ts   
                    else:
                        if uid_ts[source_uid] > source_ts:
                            uid_ts[source_uid] = source_ts
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return uid_ts, g
    else:
        for status in get_statuses_results():
            try:
                if status['retweeted_status']:
                    repost_uid = status['user']
                    rt_mid = status['retweeted_status']
                    source_uid = acquire_status_by_id(rt_mid)['user']
                    if is_in_trash_list(repost_uid) or is_in_trash_list(source_uid):
                        continue
                    g.add_edge(repost_uid, source_uid)
            except (TypeError, KeyError):
                continue
        return g
Example #12
def text_tensor(text):
    words = [w for w in utils.cut(text)]
    print(words)
    if(len(words) < 150):
        words = ['_UNK'] * (150 - len(words)) + words
    else:
        words = words[:150]
    words = [word2vec.get(w, word2vec['_UNK']) for w in words]
    words = np.asarray(words)

    sample = words.reshape(1, len(words), config.embedding_size)
    return sample
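A hedged usage note: text_tensor pads or truncates to 150 tokens, so the returned sample should have shape (1, 150, config.embedding_size) and can be fed directly to a batch-first model (word2vec and config are assumed to be module-level objects; the input string is illustrative):

sample = text_tensor(u'这是一条测试文本')
print(sample.shape)   # expected: (1, 150, config.embedding_size)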
Example #13
def compress_districts(unit,remove_voterfile=False):
    print 'running compress_districts'
    from config import locality_name,precinct_name,voterfile_delimiter,reduced_voterfile_name
    import csv
    from utils import cut
    from zipfile import ZipFile
    district_names = dict((e['name_column'],set()) for e in unit.ed_defs)
    print 'dist_names: {}'.format(district_names)
    district_names.update({locality_name:set(),precinct_name:set()})
    vf_columns = set(['voterbase_id']+[column for column_tuple in district_names for column in column_tuple if type(column_tuple) == tuple] + [column for column in district_names if type(column) == str])
    zfile = ZipFile(unit.UNCOMPRESSED_VOTER_FILE_ZIP_LOCATION)
    voter_file_name = zfile.namelist()[0]
    voter_file_full = os.path.join(unit.__path__[0],voter_file_name)
    if not os.path.exists(voter_file_full):
        zfile.extract(voter_file_name,unit.__path__[0])
    column_indexes = dict((column,idx) for idx,column in enumerate(open(voter_file_full).readline().split(voterfile_delimiter)) if column in vf_columns)

    extra_district_dicts = {}
    for k,v in (getattr(unit,'EXTRA_DISTRICTS',None) or {}).iteritems():
        edfile = os.path.join(unit.__path__[0],v['filename'])
        edcsv = csv.reader(open(edfile),delimiter='\t')
        edcsv.next()
        extra_district_dicts[k]=dict((l[0],l[v['column']-1]) for l in edcsv)

    with open(os.path.join(unit.__path__[0],reduced_voterfile_name),'w') as reduced_voterfile:
        print 'reduced_vf: {}'.format(reduced_voterfile)
        reduced_voterfile_csv = csv.DictWriter(reduced_voterfile,fieldnames=column_indexes.keys() + extra_district_dicts.keys(),delimiter=voterfile_delimiter)
        reduced_voterfile_csv.writeheader()
        for i,line in enumerate(csv.DictReader(cut(voter_file_full,sorted(column_indexes.values()),voterfile_delimiter),delimiter=voterfile_delimiter)):
            write_line = False
            for edn,edd in extra_district_dicts.iteritems():
                line.update({edn: edd[line['voterbase_id']]})
            for name_columns,district_set in district_names.iteritems():
                if type(name_columns) == tuple:
                    name = tuple(line[nc] for nc in name_columns)
                else:
                    name = line[name_columns]
                if name not in district_set:
                    write_line = True
                    district_set.add(name)
            if write_line:
                reduced_voterfile_csv.writerow(line)
            if i % 100000 == 0:
                print i
    if remove_voterfile:
        os.remove(voter_file_full)
    with open(os.path.join(unit.__path__[0],'districts.py'),'w') as districts_file:
        districts_file.write('state="{state}"\n'.format(state=unit.state_key))
        for ed in unit.ed_defs:
            districts_file.write(ed['district_type']+'='+str(district_names[ed['name_column']])+'\n')
    unit.unit_post_district_trigger()
Example #14
    def normal_tokenize(self, caption):
        # Convert caption (string) to word ids.
        tokens = self.tokenizer.word_tokenize(str(caption).lower())

        if self.filter:
            tokens = filter_freq(tokens, self.count, self.n_filter)

        if self.cut:
            tokens = cut(tokens, self.n_cut)
        caption = []
        caption.append(self.vocab('<start>'))
        caption.extend([self.vocab(token) for token in tokens])
        caption.append(self.vocab('<end>'))
        target = torch.Tensor(caption)
        return target
Example #15
def preprocessing(qlist):
    """
    Preprocess the text in qlist. Operations to consider:
       stop-word filtering, lower-casing, removing useless symbols, dropping very
       low-frequency words (e.g. appearing fewer than 10 or 20 times)
       numbers: after segmentation some tokens are pure digits; convert them to "#number"
       stemming (using the Porter stemmer)
    """
    qlist_data = utils.load_qlist('data/q_prepro.txt')
    stopset = set(stopwords.words('english'))
    minus_words = ["when", "what", "where", "how", "which", "who", "whom"]
    for i in minus_words:
        stopset.discard(i)
    p = PorterStemmer()
    qlist_ = utils.cut(qlist)
    word_dic = Counter([q for l in utils.cut(qlist_data) for q in l])
    low_freq_words = utils.find_low_freq_word(word_dic)
    new_list = []
#    f = open('data/q_prepro.txt', 'w', encoding='utf-8') 
    for line in qlist_:
        l = ""
        for word in line:
            word = word.lower()
            #stemming
            word = p.stem(word)
            # strip all punctuation
            word = ''.join(c for c in word if c not in string.punctuation)
            # convert pure digits to "#number"
            if word.isdigit():
                word = "#number"
            # keep the word only if it is neither a stop word nor a low-frequency word
            if word not in low_freq_words and word not in stopset:
                l += word + " "
#        f.write(l +'\n')
        new_list.append(l)
#    f.close()
    return new_list
Example #16
def compress_districts(unit,remove_voterfile=False):
    from config import locality_name,precinct_name,voterfile_delimiter,reduced_voterfile_name
    import csv
    from utils import cut
    from zipfile import ZipFile
    district_names = dict((e['name_column'],set()) for e in unit.ed_defs)
    district_names.update({locality_name:set(),precinct_name:set()})
    vf_columns = set(['voterbase_id']+[column for column_tuple in district_names for column in column_tuple if type(column_tuple) == tuple] + [column for column in district_names if type(column) == str])
    zfile = ZipFile(unit.UNCOMPRESSED_VOTER_FILE_ZIP_LOCATION)
    voter_file_name = zfile.namelist()[0]
    voter_file_full = os.path.join(unit.__path__[0],voter_file_name)
    if not os.path.exists(voter_file_full):
        zfile.extract(voter_file_name,unit.__path__[0])
    column_indexes = dict((column,idx) for idx,column in enumerate(open(voter_file_full).readline().split(voterfile_delimiter)) if column in vf_columns)

    extra_district_dicts = {}
    for k,v in (getattr(unit,'EXTRA_DISTRICTS',None) or {}).iteritems():
        edfile = os.path.join(unit.__path__[0],v['filename'])
        edcsv = csv.reader(open(edfile),delimiter='\t')
        edcsv.next()
        extra_district_dicts[k]=dict((l[0],l[v['column']-1]) for l in edcsv)

    with open(os.path.join(unit.__path__[0],reduced_voterfile_name),'w') as reduced_voterfile:
        reduced_voterfile_csv = csv.DictWriter(reduced_voterfile,fieldnames=column_indexes.keys() + extra_district_dicts.keys(),delimiter=voterfile_delimiter)
        reduced_voterfile_csv.writeheader()
        for i,line in enumerate(csv.DictReader(cut(voter_file_full,sorted(column_indexes.values()),voterfile_delimiter),delimiter=voterfile_delimiter)):
            write_line = False
            for edn,edd in extra_district_dicts.iteritems():
                line.update({edn: edd[line['voterbase_id']]})
            for name_columns,district_set in district_names.iteritems():
                if type(name_columns) == tuple:
                    name = tuple(line[nc] for nc in name_columns)
                else:
                    name = line[name_columns]
                if name not in district_set:
                    write_line = True
                    district_set.add(name)
            if write_line:
                reduced_voterfile_csv.writerow(line)
            if i % 100000 == 0:
                print i
    if remove_voterfile:
        os.remove(voter_file_full)
    with open(os.path.join(unit.__path__[0],'districts.py'),'w') as districts_file:
        districts_file.write('state="{state}"\n'.format(state=unit.state_key))
        for ed in unit.ed_defs:
            districts_file.write(ed['district_type']+'='+str(district_names[ed['name_column']])+'\n')
    unit.unit_post_district_trigger()
Example #17
def to_handmodel(hand_crop, direction='up'):
    handcontour = utils.skindetect(hand_crop)
    hand = utils.cut(hand_crop, handcontour)
    handskeleton = utils.skeletonize(hand)
    fingerlines = utils.linedetect(handskeleton)
    if direction == 'dn':
        handmodel = map(lambda l: [l[2], l[3]], fingerlines)
    else:
        handmodel = map(lambda l: [l[0], l[1]], fingerlines)
    if sum([1 for _ in handmodel]) > 4:
        handmodel = utils.cluster(handmodel, \
                value=lambda p: (p[0])**2 + (p[1])**2, \
                K=4)
        combine=lambda p1, p2: [(p1[0]+p2[0])/2, (p1[1]+p2[1])/2]
        handmodel = map(lambda l: reduce(combine, l), handmodel)
    return handmodel
Example #18
def to_handmodel(hand_crop, direction='up'):
    handcontour = utils.skindetect(hand_crop)
    hand = utils.cut(hand_crop, handcontour)
    handskeleton = utils.skeletonize(hand)
    fingerlines = utils.linedetect(handskeleton)
    if direction == 'dn':
        handmodel = map(lambda l: [l[2], l[3]], fingerlines)
    else:
        handmodel = map(lambda l: [l[0], l[1]], fingerlines)
    if sum([1 for _ in handmodel]) > 4:
        handmodel = utils.cluster(handmodel, \
                value=lambda p: (p[0])**2 + (p[1])**2, \
                K=4)
        combine = lambda p1, p2: [(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2]
        handmodel = map(lambda l: reduce(combine, l), handmodel)
    return handmodel
Example #19
def _index_field(field, document, item, schema_version, schema, termgen):
    prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
    field_name = field['field_name']
    # optional term; preprocessing handled in pre_func
    if field_name in schema['index_item_iter_keys']:
        term = _marshal_term(item.get(field_name), schema.get('pre_func', {}).get(field_name))
        document.add_term(prefix + term)
    # optional value; preprocessing handled in pre_func
    elif field_name in schema['index_value_iter_keys']:
        value = _marshal_value(item.get(field_name), schema.get('pre_func', {}).get(field_name))
        document.add_value(field['column'], value)
    elif field_name == 'text':
        text = item['text'].encode('utf-8')
        tokens = cut(s, text)
        termgen.set_document(document)
        termgen.index_text_without_positions(' '.join(tokens), 1, prefix)
Example #20
def put_on(img, el, condition, bg_color=1):
    # Center of the structural element
    x_c = el.size[0] // 2
    y_c = el.size[1] // 2

    # Additional emptiness to add to img along x and y
    dx_min = x_c
    dx_max = el.size[0] - x_c - 1
    dy_min = y_c
    dy_max = el.size[1] - y_c - 1


    element_pixels = Image2ll_binary(el)
    image_pixels = expand(Image2ll_binary(img),
        dx_min, dx_max, dy_min, dy_max, bg_color)
    new_image_pixels = [[
        1 for idx_y in range(img.size[1] + dy_min + dy_max)]
            for idx_x in range(img.size[0] + dx_min + dx_max)]

    black_count = 0
    for idx_x in range(dx_min, dx_min + img.size[0]):
        for idx_y in range(dy_min, dy_min + img.size[1]):
            if condition == 'any' and image_pixels[idx_x][idx_y] == 0:
                for dx in range(-dx_min, dx_max + 1):
                    for dy in range(-dy_min, dy_max + 1):
                        if element_pixels[x_c + dx][y_c + dy] == 0:
                            if new_image_pixels[idx_x + dx][idx_y + dy] != 0:
                                black_count += 1
                            new_image_pixels[idx_x + dx][idx_y + dy] = 0
            elif condition == 'all':
                if image_pixels[idx_x][idx_y] == 0:
                    all_match = True
                    for dx in range(-dx_min, dx_max + 1):
                        for dy in range(-dy_min, dy_max + 1):
                            if (image_pixels[idx_x + dx][idx_y + dy] != 0
                                and element_pixels[x_c + dx][y_c + dy] == 0):
                                    all_match = False
                                    break
                        if not all_match:
                            break
                    if all_match:
                        new_image_pixels[idx_x][idx_y] = 0
                        black_count += 1
    print('black:', black_count,
          'white:', img.size[0] * img.size[1] - black_count)
    return ll2Image_binary(cut(new_image_pixels, dx_min, dx_max, dy_min, dy_max))
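A hedged usage sketch: with condition='any' the routine stamps the element over every black pixel (a dilation of the black region), while condition='all' keeps a pixel black only when its whole neighbourhood under the element is black (an erosion); the file names are hypothetical and the inputs are assumed to be binary PIL images accepted by Image2ll_binary:

from PIL import Image

img = Image.open('binary_input.png').convert('1')         # hypothetical binary image
el = Image.open('structuring_element.png').convert('1')   # hypothetical structuring element
dilated = put_on(img, el, condition='any')
eroded = put_on(img, el, condition='all')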
Example #21
def embed_watermark(host, watermark, robustness):
    old_shape = host.shape[0: 2]
    host = utils.pad(host)
    height, width = host.shape[0: 2]
    host = cv.cvtColor(host, cv.COLOR_BGR2YCrCb)
    luma = host[:, :, 0]
    luma = luma.astype(np.float)
    watermark = cv.resize(watermark, (width >> 1, height >> 2))
    watermark = cv.adaptiveThreshold(watermark, 255, cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY, 11, 2)
    watermark, pxl_perm_mat = utils.pseudo_random_permute(watermark)
    luma = utils.dct(luma)
    luma = embed_freq_domain(luma, watermark, robustness)
    luma = utils.idct(luma)
    host[:, :, 0] = np.clip(luma, 0, 255)
    host = cv.cvtColor(host, cv.COLOR_YCrCb2BGR)
    host = utils.cut(host, old_shape)
    return host, pxl_perm_mat
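A hedged usage sketch for embed_watermark, assuming OpenCV-style inputs: a BGR host image, a grayscale watermark, and a scalar robustness factor (file names and the robustness value are illustrative):

import cv2 as cv

host = cv.imread('host.png')                        # BGR host image (hypothetical file)
mark = cv.imread('logo.png', cv.IMREAD_GRAYSCALE)   # grayscale watermark (hypothetical file)
watermarked, perm_mat = embed_watermark(host, mark, robustness=8)
cv.imwrite('watermarked.png', watermarked)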
Example #22
def load_dataset(test_size=0.2):
    ## load word vector
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(name='%s/word2vec/w2v_sgns_%s_%s_%s.npy' % (config.ModelOutputDir, config.embedding_size, config.corpus_version, datestr))
    ## load train data
    with utils.timer('Load train data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file_1)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data_4 = utils.load_cs_deleted_data(cs_delete_file_2)
        print(data_4['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3, data_4[data_4['label'] == 1].reset_index(drop=True)], axis=0,ignore_index=True)
        #data = pd.concat([data_1, data_2, data_3, data_4], axis=0,ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if (os.path.exists(DebugDir) == False):
            os.makedirs(DebugDir)
        del data_4, data_3, data_2, data_1
        gc.collect()
    ## data representation
    with utils.timer('representation for train'):
        # X = [[word2vec.get(w, word2vec['_UNK']) for w in utils.cut(text)] for text in data['text'].values]
        X = []
        y = []
        for i in range(len(data)):
            text = data['text'][i]
            if(text == ''):
                continue
            words = utils.cut(text)
            if(len(words) == 0):
                continue
            X.append([word2vec.get(w, word2vec['_UNK']) for w in words])
            y.append(data['label'][i])

    del word2vec, data
    gc.collect()

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size)

    return X_train, y_train, X_valid, y_valid
Example #23
def WordRepresentation(texts, word2vec):
    ''''''
    with utils.timer('representation'):
        ## padding
        X = []
        indexes = []
        for i in range(len(texts)):
            text = texts[i]
            words = utils.cut(text)
            if (len(words) == 0):
                continue
            if (len(words) < maxlen):
                X.append(['_UNK'] * (maxlen - len(words)) +
                         words)  # ahead padding in default mode
            else:
                X.append(words[:maxlen])
            indexes.append(i)
        ## word2vec
        X = np.array([[word2vec.get(w, word2vec['_UNK']) for w in wv]
                      for wv in X])
    return X, indexes
Example #24
def test():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")

    data = [((f[0], f[1]), float(f[2]))
            for f in [line.strip().split("|||") for line in open(sys.argv[1])]]

    print "sample data:", data[:3]

    train_data, devel_data, test_data = cut(data)

    logging.info('loading model...')
    glove_embedding = GloveEmbedding(sys.argv[2])
    logging.info('done!')
    dim = int(sys.argv[3])
    X_train = featurize(train_data, glove_embedding, dim)

    Y_train = np.array([e[1] for e in train_data])

    logging.info("Input shape: {0}".format(X_train.shape))
    print X_train[:3]
    logging.info("Label shape: {0}".format(Y_train.shape))
    print Y_train[:3]

    input_dim = X_train.shape[1]
    output_dim = 1
    model = create_model(input_dim, output_dim)
    model.fit(X_train, Y_train, nb_epoch=int(sys.argv[4]), batch_size=32)

    X_devel = featurize(devel_data, glove_embedding, dim)
    Y_devel = np.array([e[1] for e in devel_data])

    pred = model.predict_proba(X_devel, batch_size=32)
    corr = spearmanr(pred, Y_devel)
    print "Spearman's R: {0}".format(corr)
Example #25
def _index_field(field, document, item, schema_version, schema):
    prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
    if schema_version == 2:
        # optional term; stored as '0' when missing
        if field['field_name'] in ['retweeted_status']:
            term = _marshal_term(item[field['field_name']], schema['pre'][field['field_name']]) if field['field_name'] in item else '0'
            document.add_term(prefix + term)
        # required term
        elif field['field_name'] in ['user']:
            term = _marshal_term(item[field['field_name']], schema['pre'][field['field_name']])
            document.add_term(prefix + term)
        # value
        elif field['field_name'] in ['_id', 'timestamp', 'reposts_count', 'comments_count', 'attitudes_count']:
            document.add_value(field['column'], _marshal_value(item[field['field_name']]))
        elif field['field_name'] == 'text':
            tokens = cut(s, item[field['field_name']].encode('utf-8'))
            termgen = xapian.TermGenerator()
            termgen.set_document(document)
            termgen.index_text_without_positions(' '.join(tokens), 1, prefix)
            """
            for token, count in Counter(tokens).iteritems():
                document.add_term(prefix + token, count)
            """

    elif schema_version == 1:
        # required term
        if field['field_name'] in ['name', 'location', 'province']:
            term = _marshal_term(item[field['field_name']])
            document.add_term(prefix + term)
        # optional value
        elif field['field_name'] in ['created_at']:
            value = _marshal_value(item[field['field_name']], schema['pre'][field['field_name']]) if field['field_name'] in item else '0'
            document.add_value(field['column'], value)
        # required value
        elif field['field_name'] in ['_id', 'followers_count', 'statuses_count', 'friends_count', 'bi_followers_count']:
            document.add_value(field['column'], _marshal_value(item[field['field_name']]))
def detect_fissures(img):
    pimg = np.array(img)
    limiar = utils.define_threshold(img)
    if limiar > 0 and limiar < 0.05:
        offset = 10
    elif limiar > 0.05:
        offset = 25
    else:
        # fallback so offset is always bound (limiar <= 0 or limiar == 0.05)
        offset = 10

    pimg = utils.define_skeleton(img, 45, offset)
    if limiar > 0.05:
        pimg = utils.filtragem(pimg)
    runs = 30
    lines = []
    for _ in range(runs):
        lines.extend(transform.probabilistic_hough_line(pimg))

    directions = np.array(
        [utils.angle(point1, point2) for point1, point2 in lines])
    directions = np.where(directions < 0, directions + 180, directions)
    hist = np.histogram(directions, range=[0, 180], bins=180)
    sort_indexes = np.argsort(hist[0])
    hist = hist[0][sort_indexes], hist[1][sort_indexes]

    a1, a2 = hist[1][-2:]

    rot_pimg1 = skimage.transform.rotate(pimg, a1 - 90, resize=True)
    rot_pimg2 = skimage.transform.rotate(pimg, a2 - 90, resize=True)

    width = 3
    kernel1 = np.pad(np.ones([rot_pimg1.shape[0], width]),
                     1,
                     mode='constant',
                     constant_values=-1)
    kernel2 = np.pad(np.ones([rot_pimg2.shape[0], width]),
                     1,
                     mode='constant',
                     constant_values=-1)
    corr1 = sp.ndimage.correlate(rot_pimg1, kernel1, mode='constant')
    corr2 = sp.ndimage.correlate(rot_pimg2, kernel2, mode='constant')

    corr_rot1 = utils.cut(
        skimage.transform.rotate(corr1, 90 - a1, resize=True), pimg.shape)
    corr_rot2 = utils.cut(
        skimage.transform.rotate(corr2, 90 - a2, resize=True), pimg.shape)

    thresholds = [
        filters.threshold_isodata, filters.threshold_li,
        filters.threshold_mean, filters.threshold_minimum,
        filters.threshold_otsu, filters.threshold_triangle,
        filters.threshold_yen
    ]

    best_fitness = -np.inf
    best_mask = None
    for thr in thresholds:
        binary1 = np.where(corr_rot1 > thr(corr1), 1, 0)
        binary2 = np.where(corr_rot2 > thr(corr2), 1, 0)
        selem = np.ones((5, 5))
        binary_dilated1 = skimage.morphology.dilation(binary1, selem=selem)
        binary_dilated2 = skimage.morphology.dilation(binary2, selem=selem)
        mask = np.logical_or(binary_dilated1, binary_dilated2)
        fitness_value = fitness(mask, pimg)
        if fitness_value > best_fitness:
            best_fitness = fitness_value
            best_mask = mask
    mask = best_mask

    rachaduras = (1 - mask) * pimg
    # rachaduras_rgba = np.where(rachaduras[..., np.newaxis] == 1, [255,0,0,255], [0,0,0,0])
    return rachaduras
def triple_classifier(tweet):
    '''
    Output labels:
    0 neutral
    1 positive
    2 angry
    3 anxious
    4 sad
    5 disgusted
    6 other negative
    '''
    sentiment = 0
    # text = tweet['text']  # encode
    text = tweet['text_ch']
    keywords_list = []
    try:
        emoticon_sentiment = emoticon(text)
        if emoticon_sentiment != 0:
            entries = cut(cut_str, text.encode('utf-8'))
            entry = [e.decode('utf-8', 'ignore') for e in entries]
            keywords_list = entry
            if emoticon_sentiment == HAPPY:
                sentiment = emoticon_sentiment
                text = u''
            else:
                sentiment = flow_psychology_classfiy(text)
                if sentiment == 0:
                    sentiment = emoticon_sentiment
                text = u''

        if text != u'':
            entries = cut(cut_str, text.encode('utf-8'))
            entry = [e.decode('utf-8', 'ignore') for e in entries]
            keywords_list = entry

            bow = dictionary_1.doc2bow(entry)
            s = [1, 1]
            for pair in bow:
                s[0] = s[0] * (step1_score[pair[0]][0]**pair[1])
                s[1] = s[1] * (step1_score[pair[0]][1]**pair[1])
            if s[0] <= s[1]:
                bow = dictionary_2.doc2bow(entry)
                s = [1, 1, 1]
                for pair in bow:
                    s[0] = s[0] * (step2_score[pair[0]][0]**pair[1])
                    s[1] = s[1] * (step2_score[pair[0]][1]**pair[1])
                    s[2] = s[2] * (step2_score[pair[0]][2]**pair[1])
                if s[0] > s[1] and s[0] > s[2]:
                    sentiment = HAPPY
                else:
                    sentiment = flow_psychology_classfiy(text)
                    if sentiment == 0:
                        if s[1] > s[0] and s[1] > s[2]:
                            sentiment = SAD
                        elif s[2] > s[1] and s[2] > s[0]:
                            sentiment = ANGRY
                        else:
                            sentiment = 6
            else:
                sentiment = 0
    except:
        pass

    return sentiment, keywords_list
 def vlans(self):
     return [int(x) for x in utils.cut(utils.loads(self.vlan_list()))]
 def images(self):
     return utils.cut(utils.loads(self.image_list()))
Example #30
all_users = []
all_uids = []
all_groups = []
all_gids = []

for user in passwd_file:
    all_users.append(user[0])
    all_uids.append(int(user[1]))

for group in group_file:
    all_groups.append(group[0])
    all_gids.append(int(group[1]))

defaults = cat('defaults')
home = cut(grep(defaults, 'HOME'))
shell = cut(grep(defaults, 'SHELL'))
group = cut(grep(defaults, 'GROUP'))
skel = cut(grep(defaults, 'SKEL'))
del (defaults)

login_defs = cat('login.defs')
pass_max_days = cut(grep(login_defs, 'PASS_MAX_DAYS'))
pass_min_days = cut(grep(login_defs, 'PASS_MIN_DAYS'))
pass_warn_age = cut(grep(login_defs, 'PASS_WARN_AGE'))
uid_min = int(cut(grep(login_defs, 'UID_MIN')))
uid_max = int(cut(grep(login_defs, 'UID_MAX')))
gid_min = int(cut(grep(login_defs, 'GID_MIN')))
gid_max = int(cut(grep(login_defs, 'GID_MAX')))
is_home_needed = cut(grep(login_defs, 'CREATE_HOME'))
is_usegroups_enabled = cut(grep(login_defs, 'USERGROUPS_ENAB'))
Example #31
 def test_scws(self):
     sentence = u'中国好声音'
     s = load_scws()
     tokens = cut(s, sentence.encode('utf-8'))
     self.assertNotEqual(tokens, None, 'scws failed')
 if limiar > 0.05:
     width = 3
 
 #kernel1 = np.pad(np.ones([rot_pimg1.shape[0], width]), 1, mode='constant', constant_values=0)
 #kernel2 = np.pad(np.ones([rot_pimg2.shape[0], width]), 1, mode='constant', constant_values=0)
 
 kernel1 = np.ones([rot_pimg1.shape[0], width])
 kernel2 = np.ones([rot_pimg2.shape[0], width])
 
 corr1 = ndimage.correlate(rot_pimg1, kernel1, mode='constant')
 corr2 = ndimage.correlate(rot_pimg2, kernel2, mode='constant')
 print("Aplicação do filtro")
 plot_img_grid([[corr1, corr2]])
 plt.show()
 print("Imagem filtrada rotacionada a posição original")
 corr_rot1 = utils.cut(transform.rotate(corr1, 90-a1, resize=True), pimg.shape)
 corr_rot2 = utils.cut(transform.rotate(corr2, 90-a2, resize=True), pimg.shape)
 plot_img_grid([[corr_rot1, corr_rot2]])
 plt.show()
 
 binary1 = np.where(corr_rot1 > filters.threshold_yen(corr1), 1, 0)
 binary2 = np.where(corr_rot2 > filters.threshold_yen(corr2), 1, 0)
 
 print('Binarized filtered image')
 plot_img_grid([[binary1, binary2]])
 plt.show()
 print('Dilated binarized filtered image')
 selem = np.array([[0,1,0],[1,1,1],[0,1,0]])
 binary_dilated1 = morphology.dilation(binary1, selem=selem)
 binary_dilated2 = morphology.dilation(binary2, selem=selem)
 plot_img_grid([[binary_dilated1, binary_dilated2]])
def triple_classifier(tweet):
    """
    Output labels:
    0 neutral
    1 positive
    2 angry
    3 anxious
    4 sad
    5 disgusted
    6 other negative
    """
    sentiment = 0
    text = tweet["text"]  # encode
    keywords_list = []

    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != 0:
        entries = cut(cut_str, text)
        entry = [e.decode("utf-8", "ignore") for e in entries]
        keywords_list = entry
        if emoticon_sentiment == HAPPY:
            sentiment = emoticon_sentiment
            text = ""
        else:
            sentiment = flow_psychology_classfiy(text)
            if sentiment == 0:
                sentiment = emoticon_sentiment
            text = ""

    if text != "":
        entries = cut(cut_str, text)
        entry = [e.decode("utf-8", "ignore") for e in entries]
        keywords_list = entry

        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0] ** pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1] ** pair[1])
        if s[0] <= s[1]:
            bow = dictionary_2.doc2bow(entry)
            s = [1, 1, 1]
            for pair in bow:
                s[0] = s[0] * (step2_score[pair[0]][0] ** pair[1])
                s[1] = s[1] * (step2_score[pair[0]][1] ** pair[1])
                s[2] = s[2] * (step2_score[pair[0]][2] ** pair[1])
            if s[0] > s[1] and s[0] > s[2]:
                sentiment = HAPPY
            else:
                sentiment = flow_psychology_classfiy(text)
                if sentiment == 0:
                    if s[1] > s[0] and s[1] > s[2]:
                        sentiment = SAD
                    elif s[2] > s[1] and s[2] > s[0]:
                        sentiment = ANGRY
                    else:
                        sentiment = 6
        else:
            sentiment = 0

    return sentiment
    with open(stop_file, "r") as f:
        stop_list = f.readlines()
        stop_list = list(map(lambda x: x.rstrip("\n"), stop_list))
    with open("./data/adult_content.txt", "r") as f:
        lines = f.readlines()
        lines = list(map(lambda x: x.rstrip("\n"), lines))

        lines_n = []
        for line in lines:
            line = line.split("\t")

            line = " ".join(line[-4:])
            #  line = "".join(line.split())
            line = " ".join(line.split())
            line = " ".join(
                list(cut([line], use_stop=use_stop, stop_list=stop_list)))
            line += "\t1"
            lines_n.append(line)

    with open("./data/normal_content.txt", "r") as f:
        lines_p = f.readlines()
        lines_p = list(map(lambda x: x.rstrip("\n"), lines_p))
        lines_p = list(map(lambda x: " ".join(x.split()), lines_p))
        lines_p = cut(lines_p, use_stop=use_stop, stop_list=stop_list)
        lines_p = list(map(lambda x: x + "\t0", lines_p))

    lines_all = lines_n + lines_p

    random.shuffle(lines_all)

    with open("data/data.txt", "w") as f: