Example #1
0
def getBrokenlinks(url):
    """Count broken outgoing links on *url*'s page.

    Every valid candidate link is checked concurrently, one MyThread worker
    per link; a worker whose getResult() is truthy counts as one broken link.

    :param url: page object exposing getsoup() and geturl()
    :return: number of broken links found
    :raises WebcredError: if the page itself cannot be fetched or parsed
    """
    broken_links = 0
    threads = []
    try:
        soup = url.getsoup()
    except WebcredError:
        # Already the right exception type — re-raise as-is instead of
        # wrapping in a new WebcredError, which discarded the traceback.
        raise
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        raise WebcredError('Url is broken')

    for link in soup.find_all('a', href=True):
        uri = link.get('href')

        # Resolve scheme-less (relative) links against the page URL.
        # TODO should it include inner links as well?
        if not uri.startswith(('http://', 'https://')):
            uri = url.geturl() + uri

        if validators.url(uri):
            t = MyThread(Method='funcBrokenllinks',
                         Name='brokenlinks',
                         Url=uri)
            t.start()
            threads.append(t)

    for t in threads:
        t.join()
        if t.getResult():
            broken_links += 1

    return broken_links
Example #2
0
 def collect(self):
     """Run self.check for every (remark, url) pair concurrently.

     Spawns one MyThread per pair, then joins each worker in creation
     order and yields its result.
     """
     workers = []
     for remark, url in self.urls:
         worker = MyThread(self.check, args=(remark, url))
         workers.append(worker)
         worker.start()
     for worker in workers:
         worker.join()
         yield worker.get_result()
Example #3
0
    def __create_workers(self):
        """
        Spawn the parser worker threads.

        Creates ``Properties.PARSER_MAX_THREADS`` threads, each given a
        freshly built HTML-service instance plus this parser object.

        :return: Nothing
        """
        for _ in range(Properties.PARSER_MAX_THREADS):
            # A new factory/service instance is built per worker on purpose.
            worker = MyThread(
                HTMLServiceFactory(self.html_service_type).get_instance(),
                self)
            worker.start()
Example #4
0
    def create_workers(self):
        """
        Spawn the crawler worker threads.

        Creates ``Properties.CRAWLER_MAX_THREADS`` threads, each given a
        fresh URL-DAO instance, this crawler and the shared bucket; every
        worker is recorded in the private thread list before being started.

        :return: Nothing
        """
        for _ in range(Properties.CRAWLER_MAX_THREADS):
            worker = MyThread(
                UrlDAOFactory(self.db_type).get_instance(),
                self,
                self.bucket)
            self.__threads.append(worker)
            worker.start()
Example #5
0
def getImgratio(url):
    """Return the fraction of page weight that is text: text / (text + images).

    Image sizes are fetched concurrently, one MyThread worker per image URL.

    :param url: page object exposing getsize(), getsoup() and geturl()
    :return: float ratio, or the error message string when the page size
        itself cannot be determined (legacy contract, preserved)
    :raises WebcredError: if the ratio cannot be computed (total size is 0)
    """
    total_img_size = 0
    threads = []

    try:
        text_size = url.getsize()
    except WebcredError as e:
        # Legacy contract: return the message instead of raising.
        return e.message

    soup = url.getsoup()

    # Accumulate the size of every referenced image.
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        # Resolve scheme-less (relative) image paths against the page URL.
        if not uri.startswith(('http://', 'https://')):
            uri = url.geturl() + uri

        if validators.url(uri):
            try:
                uri = Urlattributes(uri)
                t = MyThread(Method='funcImgratio', Name='Imgratio', Url=uri)
                t.start()
                threads.append(t)
            except WebcredError:
                # Deliberate best-effort: an inaccessible image is skipped.
                pass

    for t in threads:
        t.join()
        t.freemem()
        size = t.getResult()
        # Workers may return a non-int sentinel on failure; count ints only.
        if isinstance(size, int):
            total_img_size += size

    try:
        total_size = total_img_size + text_size
        ratio = float(text_size) / total_size
    except ZeroDivisionError:
        # BUG FIX: division raises ZeroDivisionError, not ValueError, when
        # both text and image sizes are zero — the old handler was dead code.
        raise WebcredError('Error in fetching images')

    return ratio
Example #6
0
def main(cross_num=5,
         exp_path='',
         sav_dir='',
         conf='',
         cfg_sec='',
         bool_vad=False):
    """Build cross-validation datasets from the five emotion sessions.

    Loads per-session data in parallel (one MyThread per session), computes
    a global bias from the minimum spectrogram value, then writes either a
    5-fold (leave-one-session-out) or 10-fold (leave-one-speaker-out) split
    via store().

    :param cross_num: 5 selects session-level folds; anything else 10-fold
    :param exp_path: directory holding the per-session feature files
    :param sav_dir: directory where the fold datasets are written
    :param conf: config file path forwarded to get_emo_data
    :param cfg_sec: config section name forwarded to get_emo_data
    :param bool_vad: VAD flag forwarded to get_emo_data and store()
    """
    if not os.path.exists(exp_path):
        os.mkdir(exp_path)

    # One sub-directory per session.
    for i in range(1, 6):
        session_name = 'Session' + str(i)
        session_path = exp_path + '/' + session_name
        if not os.path.exists(session_path):
            os.mkdir(session_path)

    if not os.path.exists(sav_dir):
        os.mkdir(sav_dir)

    # Load the five sessions concurrently.
    # NOTE(review): the worker path is exp_path + 'SessionN/' (no separator)
    # while the directories above use exp_path + '/SessionN' — presumably
    # exp_path is passed with a trailing slash; confirm against callers.
    thread_list = []
    lab_list = []
    for i in range(5):
        t = MyThread(
            get_emo_data,
            args=('Session' + str(i + 1), True, bool_vad, 'spectrogram', 3,
                  exp_path + 'Session' + str(i + 1) + '/', conf, cfg_sec))
        thread_list.append(t)

    for t in thread_list:
        t.start()

    for t in thread_list:
        t.join()
        lab_list.append(t.get_result())

    # Merge per-session results into one dict keyed by session/speaker.
    lab_dict = dict()
    for item in lab_list:
        lab_dict.update(item)

    # np.inf replaces np.Inf, which was removed in NumPy 2.0.
    min_val = np.inf

    freq_bag = set()
    time_bag = set()

    # One pass over every spectrogram: track the global minimum value and
    # the sets of observed time/frequency dimensions.
    for sess_spk in lab_dict.keys():
        # sess_spk identifies a session's F/M speaker; its value maps
        # wav_file_name -> [[mat, label, valence, arouse, domain], ...]
        wav_file_info_dict = lab_dict.get(sess_spk)
        print(sess_spk, len(list(wav_file_info_dict.keys())))
        for wav_file in wav_file_info_dict:
            info_list = wav_file_info_dict.get(wav_file)
            for mat, label, valence, arouse, domain in info_list:
                min_val = min(min_val, mat.min())
                time_bag.add(mat.shape[1])
                freq_bag.add(mat.shape[0])

    print('Time bag:\n\t', time_bag)
    print('Freq bag:\n\t', freq_bag)
    print(max(time_bag), min(time_bag))
    # Shift so the smallest observed value maps above zero after subtraction.
    bias = min_val - 1
    print(bias)

    sess_spk = set(lab_dict.keys())
    if cross_num == 5:
        # Five-fold: leave one session out (its F half as dev, M as test).
        for i in range(1, 6):
            cross_val = 'leave_' + str(i)
            file_path = sav_dir + '/' + cross_val
            dev_key = {'Session' + str(i) + '_F'}
            test_key = {'Session' + str(i) + '_M'}
            train_key = sess_spk - test_key - dev_key
            if not os.path.exists(file_path):
                os.mkdir(file_path)
            print(test_key)
            store(file_path=file_path,
                  train_key=train_key,
                  test_key=test_key,
                  data_dict=lab_dict,
                  bias=bias,
                  bool_vad=bool_vad)
            print()
    else:
        sess_spk_list = list(sess_spk)
        # Ten-fold: leave one session/speaker combination out per fold.
        # BUG FIX: the key construction and store() call were dedented
        # outside this loop, so only the final fold was ever written; they
        # now execute once per fold.
        # NOTE(review): data_path_prefix is not defined in this function —
        # presumably a module-level setting elsewhere in the file; confirm,
        # or consider using sav_dir like the five-fold branch.
        for i in range(1, 11):
            idx = sess_spk_list[i - 1]
            cross_val = 'leave_' + str(i)
            file_path = data_path_prefix + '10cross_set/' + cross_val
            test_key = {idx}
            train_key = sess_spk - test_key
            if not os.path.exists(file_path):
                os.mkdir(file_path)
            store(file_path=file_path,
                  train_key=train_key,
                  test_key=test_key,
                  data_dict=lab_dict)
Example #7
0
        if(ret_code!=0):
            raise Exception("section[{}] func[{}] error[{}]".format(section, 'deal_csv', ret_msg))  

    threads=[]
    threads_num=0
    for section in section_list_multiple:
        if(section_name!=None):
            if(section!=section_name):
                continue
        file_dict=config_dic.get(section, None)
        if(file_dict==None):
            raise Exception("section[{}] not found".format(section))  

        t=MyThread(deal_csv,args=(section, file_dict, glob_config, relation_ds))
        threads.append(t)
        t.start()
        threads_num+=1
        # print("\n\nNow begin deal section[{}]".format(section))
        # ret_code, ret_msg, relation_ds, out_ds =deal_csv(file_dict, glob_config, relation_ds, out_ds)
        # if(ret_code!=0):
        #     raise Exception("section[{}] func[{}] error[{}]".format(section, 'deal_csv', ret_msg))  

    for t in threads:
        t.join()  # 一定要join,不然主线程比子线程跑的快,会拿不到结果
        ret_code, ret_msg, relation_ds, out_ds_list[t.args[0]] = t.get_result()
        if(ret_code!=0):
            raise Exception("thread[{}] func[{}] error[{}]".format(t, 'deal_csv', ret_msg))  

    # out_ds=pd.DataFrame({'openday':[], 'detail_type':[], 'detail_cnt':[], 'detail_amt':[]})
    i=0
    out_ds=pd.DataFrame({'openday':[], 'detail_type':[], 'detail_cnt':[], 'detail_amt':[]})