Example #1
def matching_confidence(video, subtitle):
    """Compute the confidence that the subtitle matches the video.
    Returns a float between 0 and 1. 1 being the perfect match."""
    guess = guessit.guess_file_info(subtitle.release, "autodetect")
    video_keywords = utils.get_keywords(video.guess)
    subtitle_keywords = utils.get_keywords(guess) | subtitle.keywords
    replacement = {"keywords": len(video_keywords & subtitle_keywords)}
    if isinstance(video, videos.Episode):
        replacement.update({"series": 0, "season": 0, "episode": 0})
        matching_format = "{series:b}{season:b}{episode:b}{keywords:03b}"
        best = matching_format.format(series=1, season=1, episode=1, keywords=len(video_keywords))
        if guess["type"] in ["episode", "episodesubtitle"]:
            if "series" in guess and guess["series"].lower() == video.series.lower():
                replacement["series"] = 1
            if "season" in guess and guess["season"] == video.season:
                replacement["season"] = 1
            if "episodeNumber" in guess and guess["episodeNumber"] == video.episode:
                replacement["episode"] = 1
    elif isinstance(video, videos.Movie):
        replacement.update({"title": 0, "year": 0})
        matching_format = "{title:b}{year:b}{keywords:03b}"
        best = matching_format.format(title=1, year=1, keywords=len(video_keywords))
        if guess["type"] in ["movie", "moviesubtitle"]:
            if "title" in guess and guess["title"].lower() == video.title.lower():
                replacement["title"] = 1
            if "year" in guess and guess["year"] == video.year:
                replacement["year"] = 1
    else:
        return 0
    confidence = float(int(matching_format.format(**replacement), 2)) / float(int(best, 2))
    return confidence
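
The confidence above is built by packing each matched field into a binary string and dividing by the best achievable score. A minimal sketch with hypothetical values: a video whose guess yields 4 keywords, and a subtitle that matches the series and episode (but not the season) and shares 3 of those keywords.

matching_format = "{series:b}{season:b}{episode:b}{keywords:03b}"
best = matching_format.format(series=1, season=1, episode=1, keywords=4)   # '111100' -> 60
found = matching_format.format(series=1, season=0, episode=1, keywords=3)  # '101011' -> 43
confidence = float(int(found, 2)) / float(int(best, 2))
print(round(confidence, 3))  # 0.717
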
    def list(self, video, languages):
        languages = languages & self.availableLanguages()
        if not languages:
            self.logger.debug(u'No language available')
            return []
        if not self.isValidVideo(video):
            self.logger.debug(u'Not a valid video')
            return []
        results = []
        if isinstance(video, Episode):
            results = self.query(video.path or video.release, languages, get_keywords(video.guess), series=video.series, season=video.season, episode=video.episode)
        elif isinstance(video, Movie) and video.year:
            results = self.query(video.path or video.release, languages, get_keywords(video.guess), movie=video.title, year=video.year)
        return results
Example #4
def filterchain(request,
                app,
                model,
                field,
                foreign_key_app_name,
                foreign_key_model_name,
                foreign_key_field_name,
                value,
                manager=None):
    model_class = get_model(app, model)
    m2m = is_m2m(model_class, field)
    keywords = get_keywords(field, value, m2m=m2m)
    # filter queryset using limit_choices_to
    limit_choices_to = get_limit_choices_to(foreign_key_app_name,
                                            foreign_key_model_name,
                                            foreign_key_field_name)
    queryset = get_queryset(model_class, manager, limit_choices_to)

    results = queryset.filter(**keywords)

    # Sort results if model doesn't include a default ordering.
    if not getattr(model_class._meta, 'ordering', False):
        results = list(results)
        sort_results(results)

    serialized_results = serialize_results(results)
    results_json = json.dumps(serialized_results)
    return HttpResponse(results_json, content_type='application/json')
Example #5
def filterchain_all(request, app, model, field, foreign_key_app_name,
                    foreign_key_model_name, foreign_key_field_name, value):
    """Returns filtered results followed by excluded results below."""
    model_class = get_model(app, model)
    keywords = get_keywords(field, value)
    # filter queryset using limit_choices_to
    limit_choices_to = get_limit_choices_to(foreign_key_app_name,
                                            foreign_key_model_name,
                                            foreign_key_field_name)
    queryset = get_queryset(model_class, limit_choices_to=limit_choices_to)

    filtered = list(queryset.filter(**keywords))
    sort_results(filtered)

    excluded = list(queryset.exclude(**keywords))
    sort_results(excluded)

    # Empty choice to separate filtered and excluded results.
    empty_choice = {'value': "", 'display': "---------"}

    serialized_results = (serialize_results(filtered) + [empty_choice] +
                          serialize_results(excluded))

    results_json = json.dumps(serialized_results)
    return HttpResponse(results_json, content_type='application/json')
    def list(self, video, languages):
        languages = languages & self.availableLanguages()
        if not languages:
            self.logger.debug(u'No language available')
            return []
        if not self.isValidVideo(video):
            self.logger.debug(u'Not a valid video')
            return []
        results = self.query(video.path or video.release, languages, get_keywords(video.guess), video.series, video.season, video.episode)
        return results
    def apply_(self, p1, p2, p3, p4, key):
        grab_screen(p1, p2, "data/pic_ques.png")
        grab_screen(p3, p4, "data/pic_ans.png")
        merge_pic("data/pic_ques.png", "data/pic_ans.png")
        r = pic_handle("data/target_img.png", key)
        if r:
            res = r
            # print("words result: ", res)
            ques_content, ans_content = handle_words(res)
            keywords = get_keywords(ques_content)
            data_res = search_related_records(keywords)
            print(data_res, "data_res", len(data_res))

            if len(data_res) >= 1:  # the question already exists in the database
                real_ans, final_index = get_the_most_similar(ques_content, [r[1] for r in data_res])
                real_ans = data_res[final_index][2]
                pno = data_res[final_index][0]
                if real_ans:  # a correct answer was found
                    given_ans, index = get_the_most_similar(real_ans, ans_content)
                    position: str = self.find_position_by_index(index+1)
                    print(real_ans)
                    if self.auto_apply:
                        mouse_click_(position)
                else:  # no correct answer found, only recorded wrong answers
                    wrong_ans = data_res[final_index][3]
                    print("题目尚无正确答案,进行随即作答")
                    index = auto_choose_answer(wrong_ans, ans_content)
                    print("index: ", index)
                    position = self.find_position_by_index(index)
                    mouse_click_(position)
                    wrong_ans = json.loads(wrong_ans)
                    correct = sg.popup_yes_no("选择对了吗?", keep_on_top=True)
                    if correct == "Yes":
                        if len(wrong_ans) < 4:
                            engine.update_or_insert(pno=pno, ques=ques_content, ans=ans_content[len(wrong_ans)])
                    else:
                        if len(wrong_ans) < 4:
                            engine.update_or_insert(pno=pno, ques=ques_content, wrong_ans=[ans_content[len(wrong_ans)]])

            else:  # question not yet in the database: answer in order and record right/wrong answers
                print("题目可能未收录, 进行随机作答")  # "question may not be recorded yet; answering at random"
                real_ans = ""
                # index = self.auto_choose_answer("", ans_content)
                position = self.find_position_by_index(1)
                mouse_click_(position)
                correct = sg.popup_yes_no("选择对了吗?", keep_on_top=True)  # 百度文字识别每日字数限制,这里选择手动识别
                if correct == "Yes":
                    engine.update_or_insert(ques=ques_content, ans=ans_content[0])  # first time apply 0
                else:
                    engine.update_or_insert(ques=ques_content, wrong_ans=ans_content[:1])  # wrong_ans must be a list
            print(real_ans)
            self.start_btn.set_focus(force=True)
Example #8
def test_spider():
    """
    test spider
    """
    config = JsonConf.load('./../conf.json')

    keys = config['url_api'].keys()
    fetcher = createInstance('url_crawler',
                             'UrlFetcher',
                             max_repeat=2,
                             sleep_time=1)
    parser = createInstance('url_crawler', 'UrlParser', max_deep=1)
    saver = createInstance('url_crawler', 'UrlSaver', config)
    # if need_proxy == '1':
    proxieser = createInstance('url_crawler', 'UrlProxieser', sleep_time=1)
    # else:
    #     proxieser = None

    # initial web_spider
    web_spider = WebSpider(fetcher,
                           parser,
                           saver,
                           proxieser,
                           monitor_sleep_time=1)
    keywords = get_keywords()
    # urls = []
    for i in keywords:
        for key in keys:
            api = config['url_api'][key]
            url = api.format(i)
            web_spider.set_start_url(url,
                                     keys={
                                         'key': key,
                                         'replace':
                                         config['json_replace'][key],
                                         'depth': config['url_depth'][key],
                                         'need_proxy':
                                         config['need_proxy'][key]
                                     })
        # web_spider.start_working(fetcher_num=2)
        # # wait for finished
        # web_spider.wait_for_finished()

    web_spider.start_working(fetcher_num=10)
    # wait for finished
    web_spider.wait_for_finished()

    return
Example #9
def create_lateral_pseudo_steps():
    lateral_files = files_folder.joinpath('lateral')
    element_data = pd.read_excel(lateral_files.joinpath('element_data.xlsx'),
                                 header=None)
    inital_gap = pd.read_excel(lateral_files.joinpath('initial_gap.xlsx'),
                               header=None)
    surface_behav = pd.read_excel(lateral_files.joinpath('surface_behav.xlsx'),
                                  header=None)
    surface_behav = list(surface_behav.ffill().groupby(0).apply(
        lambda df: df.iloc[:, 1:].values))
    assert len(surface_behav) == inital_gap.shape[0] == 61
    max_count = len(surface_behav)
    pseudo_steps = list()
    for i in range(max_count):
        pseudo_step = Step(
            get_keywords(lateral_files.joinpath('lateral_template.txt')))

        element_kw = pseudo_step['Element']

        # Update *Element
        new_name = element_kw.params['ELSET'][:-1] + str(i + 1)
        element_kw.params['ELSET'] = new_name

        # Read i'th row from element_data.xlsx and insert it into element_kw.data
        element_datum = element_data.iloc[i].tolist()
        element_kw.data = [element_datum]

        # Update *GAP
        gap_kw = pseudo_step['GAP']
        gap_kw.params['ELSET'] = new_name

        gap_kw.data[0][0] = -inital_gap.iloc[i, 0]
        if i == 0 or i == max_count - 1:  # if first or last, then ignore
            pass
        else:  # else multiply by 2
            gap_kw.data[0][-1] = float(gap_kw.data[0][-1]) * 2

        # Update *SURFACE BEHAVIOR
        surfbeh_kw = pseudo_step['SURFACE BEHAVIOR']
        surfbeh_kw.data = list(surface_behav[i])
        pseudo_steps.append(pseudo_step)

    write_steps(pseudo_steps,
                results_folder.joinpath('lateral_pseudo_steps.txt'))
def generate_data(files):
    """given a list of files, returns ..."""
    data = []
    current_ids = 0
    for file in files:
        with open(file, 'r') as f:
            text = f.read()
        doc = nlp(text)
        for i, phrase in enumerate(doc.sents, current_ids):
            phrase = str(phrase)
            if ('\n' in phrase[:-2]):
                continue
            keywords = get_keywords(phrase)
            if len(keywords) > 3:
                data.append({"sentence": phrase,
                            "keywords": keywords,
                            "source": os.path.basename(file)})
                current_ids += 1
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
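
A brief usage sketch for generate_data: the file names below are placeholders, and nlp (a loaded spaCy model) plus get_keywords are assumed to be importable, as in the function above. Each record written to data.json has the shape assembled in the append call.

# Hypothetical call; 'notes1.txt' and 'notes2.txt' are placeholder paths.
generate_data(['notes1.txt', 'notes2.txt'])
# data.json then contains a list of entries such as:
# {"sentence": "...", "keywords": [...], "source": "notes1.txt"}
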
Example #11
def create_end_pseudo_steps():
    end_files = files_folder.joinpath('end')
    end_data = pd.read_excel(end_files.joinpath('element_data_end.xlsx'),
                             header=None)
    area_data = pd.read_excel(end_files.joinpath('area_data.xlsx'),
                              header=None)
    qz_curve = pd.read_excel(end_files.joinpath('qz_curve.xlsx'), header=None)

    max_count = 21
    pseudo_steps = list()
    for i in range(max_count):
        pseudo_step = Step(get_keywords(
            end_files.joinpath('end_template.txt')))

        element_kw = pseudo_step['Element']

        # Update *Element
        new_name = element_kw.params['ELSET'][:-1] + str(i + 1)
        element_kw.params['ELSET'] = new_name

        # Read i'th row from element_data_end.xlsx and insert it into element_kw.data
        element_datum = end_data.iloc[i].tolist()
        element_kw.data = [element_datum]

        # Update *GAP
        gap_kw = pseudo_step['GAP']
        gap_kw.params['ELSET'] = new_name

        # Replace only the last value
        gap_kw.data[0][-1] = area_data.iloc[i, 0]

        # Update *SURFACE BEHAVIOR
        surfbeh_kw = pseudo_step['SURFACE BEHAVIOR']
        surfbeh_kw.data = list(qz_curve.values)
        pseudo_steps.append(pseudo_step)

    write_steps(pseudo_steps, results_folder.joinpath('end_pseudo_steps.txt'))

    pass
Example #12
                continue
            if (looking_concept in words):
                total = total + 1
                concept = words[0] + " " + words[1] + " " + words[2]

                concept = masking(concept, looking_concept)
                mapped_concept = relation_mapping(words, concept)

                sentiment_dict = sid_obj.polarity_scores(mapped_concept)
                if sentiment_dict['compound'] >= 0.05:
                    pos_count = pos_count + 1
                elif sentiment_dict['compound'] <= -0.05:
                    neg_count = neg_count + 1
                else:
                    neut_count = neut_count + 1

                outfile.write(mapped_concept + " " +
                              str(sentiment_dict['compound']) + "\n")
        write_results(outfile, looking_concept, total, neg_count, pos_count,
                      neut_count)


if __name__ == "__main__":
    all_concepts_long = []
    infile = open("./../data/ConceptNet_data", "r")
    for line in infile.readlines():
        all_concepts_long.append(line)

    key_words = get_keywords()
    sentiment_analyzer(key_words, all_concepts_long)
def onMessage(bot, update, user_data):

    global first
    global specified_disease
    global data
    global information_hospitals
    global lat
    global lon

    if update.message.chat.id not in users:
        users.append(update.message.chat.id)
    else:
        first = False

    if first:  # First time, it displays a presentation message
        username = update.message.chat.first_name
        if username is None:
            username = "******"
        else:
            username = "******" + username

        first = False
        msg = tr2english(update.message.text.lower(), user_data)
        message1 = "Hi" + username + "! I'm your Health Mate. I'm here to inform you about rare diseases. Please don't hesitate to ask me any question you have about it, I'll try my best to answer your doubts using trustful sources. Remember that I'm just here to inform you; for anything else contact a professional."
        message2 = "First of all, I would like you to tell me which disease you want to find out about. Please, tell me the name of the disease."
        bot.send_message(chat_id=update.message.chat_id,
                         text=tr2other(message1, user_data['language']))
        bot.send_message(chat_id=update.message.chat_id,
                         text=tr2other(message2, user_data['language']))
        specified_disease = False
        return

    else:
        msg = tr2english(update.message.text.lower(), user_data)
        keywords = get_keywords(msg)
        if specified_disease == False:
            for i in diseases:
                if i in msg:
                    data = read_book("main_data_" + i + ".json")
                    information_hospitals = read_book(
                        'hospitals_information_' + i + '.json')
                    message = "Perfect!. I understand you asked about " + i + ". From now on we will be talking about this disease in particular."
                    message2a = "If I did not understand it wel, or you wish to change the subject, you can type"
                    message2b = "and I will ask again."
                    bot.send_message(chat_id=update.message.chat_id,
                                     text=tr2other(message,
                                                   user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(message2a, user_data['language']) +
                        ' /disease ' +
                        tr2other(message2b, user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(
                            'You can ask me any question related to this disease.',
                            user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other(
                            'You can also send me your location and I will find the closest hospital where this disease can be diagnosed and treated.',
                            user_data['language']))
                    bot.send_message(
                        chat_id=update.message.chat_id,
                        text=tr2other('You can type in',
                                      user_data['language']) + ' /help ' +
                        tr2other('To know about all the text commands.',
                                 user_data['language']))
                    specified_disease = True
                    return
            # None of the known diseases was mentioned: apologize and ask again.
            bot.send_message(
                chat_id=update.message.chat_id,
                text=tr2other(
                    "Sorry, I did not understand you. Please try again. The diseases I have information about are: "
                    + diseases[0] + '.', user_data['language']))
            return

        elif 'hello' in keywords or 'hi' in keywords or 'greetings' in keywords:

            bot.send_message(chat_id=update.message.chat_id,
                             text=tr2other("Hello!", user_data['language']))

        elif 'bye' in keywords or 'goodbye' in keywords:

            bot.send_message(chat_id=update.message.chat_id,
                             text=tr2other("Goodbye! Thanks for trusting me.",
                                           user_data['language']))

        elif 'thank' in keywords or 'appreciate' in keywords or 'thanks' in keywords:

            bot.send_message(
                chat_id=update.message.chat_id,
                text=tr2other(
                    "You are welcome! I am always trying to give my best",
                    user_data['language']))

        else:
            message = tr2other('Okay, give me some time to think about it...',
                               user_data['language'])
            bot.send_message(chat_id=update.message.chat_id,
                             text=message,
                             parse_mode=telegram.ParseMode.MARKDOWN)

            info = process_message(msg, data)
            if info:
                source_message = f"In case you want more information about it, the information I found comes from this source:\n `{info['URL']}` \n From the section *{info['title']}*"
                answer = f"Alright! I found the following information: \n \n {info['text']}"
                bot.send_message(chat_id=update.message.chat_id,
                                 text=tr2other(answer, user_data['language']),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
                bot.send_message(chat_id=update.message.chat_id,
                                 text=tr2other(source_message,
                                               user_data['language']),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
            else:
                error_msg = 'I am sorry, but I cannot answer this properly. You can try asking it a different way or contact a specialist for further information'
                bot.send_message(chat_id=update.message.chat_id,
                                 text=tr2other(error_msg,
                                               user_data['language']),
                                 parse_mode=telegram.ParseMode.MARKDOWN)
Example #14
def analysis(fpath: str, extname, imgdir=None, do_drawings=False):
    content = None
    images = []
    # drawings = []

    kw_arr = []
    freq_arr = []
    ph_arr = []
    nw_arr = []
    sum_arr = []

    # if not do_drawings:
    if True:
        if extname == '.txt':
            content = readtxt.read(fpath)

        if extname == '.docx':
            content = readword.readtxt(fpath)
            images = readword.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.doc':
            content = readword.readtxt(fpath + 'x')
            images = readword.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))

        if extname == '.pptx':
            content = readppt.readtxt(fpath)
            images = readppt.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.ppt':
            content = readppt.readtxt(fpath + 'x')
            images = readppt.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))

        if extname == '.pdf':
            content = readpdf.readtext(fpath)

    drawings = None
    do_split_drawing = False
    if do_drawings:
        if extname == '.dxf':
            content = readdxf.readtxt(fpath)
            if do_split_drawing:
                drawings = readdxf.split_drawing_byblock(fpath)

        if extname == '.dwg':
            maxtry = 30
            transpath = fpath.replace('.dwg', '.dxf')
            for ii in range(maxtry):
                print(ii)
                time.sleep(3)
                if os.path.isfile(transpath):
                    content = readdxf.readtxt(transpath)
                    if do_split_drawing:
                        drawings = readdxf.split_drawing_byblock(fpath)
                    break

        if extname == '.rar':
            content = readrar.readrar(fpath, rm_prefix=True, maxnames=10)

        if extname == '.zip':
            content = readrar.readzip(fpath, rm_prefix=True, maxnames=10)

    # do analysis
    if content is not None:
        # too long!!!
        total_words_count = len(' '.join(content))
        total_paragraph_count = len(content)
        max_words = 50000
        if total_words_count > max_words:
            paragraph_limit = math.ceil(max_words / total_words_count *
                                        total_paragraph_count)
            content = content[:paragraph_limit]
            print('limit paragraphs ' + str(paragraph_limit))
            print('limit words ' + str(len(' '.join(content))))

        # key words
        kw_arr = utils.get_keywords(content, config.kw_topk)
        # word frequency array
        freq = utils.get_freq(content)
        freq_arr = list(map(lambda x: str(freq[x])
                            if x in freq else '0', kw_arr))
        # key phrases
        ph_arr = utils.get_phrase(content, n=10)
        # new words
        if not extname == '.dwg':
            nw_arr = utils.get_newwords(content, n=20)
        # auto summary
        if extname == '.rar' or extname == '.zip':
            sum_arr = content
        else:
            sum_arr = utils.get_summary(content, n=10)

    # give keywords to images
    # ['fname', 'keywords', 'relatedtxt']
    makeparam = {}
    if images:
        for cimg in images:
            # cimg['keywords'] = ','.join(utils.get_keywords([cimg['relatedtxt']], config.kw_topk_image))
            makeparam[cimg['fname']] = cimg['relatedtxt']

        kwdic = utils.get_keywordsmany(makeparam, config.kw_topk_image)
        for cimg in images:
            cimg['keywords'] = ','.join(kwdic[cimg['fname']][0])
            cimg['newwords'] = ','.join(kwdic[cimg['fname']][1])
            cimg['docname'] = fpath

    return (
        ','.join(kw_arr),
        # ','.join(freq_arr),
        ','.join([x + ':' + y for x, y in zip(kw_arr, freq_arr)]),
        ','.join(ph_arr),
        ','.join(nw_arr),
        sum_arr,
        images,
        drawings)
Example #15
    def run(self):
        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s\t%(levelname)s\t%(message)s")

        need_proxy = self.config.getStr('need_proxy', self.name)

        fetcher = createInstance('url_crawlers',
                                 self.name + 'UrlFetcher',
                                 max_repeat=2,
                                 sleep_time=1)
        parser = createInstance('url_crawlers',
                                self.name + 'UrlParser',
                                max_deep=1)
        saver = createInstance('url_crawlers', self.name + 'UrlSaver')
        if need_proxy == '1':
            proxieser = createInstance('url_crawlers',
                                       self.name + 'UrlProxieser',
                                       sleep_time=1)
        else:
            proxieser = None

        # initial web_spider
        web_spider = WebSpider(self.name,
                               fetcher,
                               parser,
                               saver,
                               proxieser,
                               monitor_sleep_time=5)

        keywords = get_keywords()
        # urls = []
        api = self.config.getStr('url_api', self.name)
        for i in keywords:
            url = api.format(i)
            web_spider.set_start_url(
                url,
                keys={
                    'website': self.name,
                    'keyword': i,
                    'params': {
                        'jsv': '2.3.16',
                        'appKey': '12574478',
                        't': None,
                        'sign': None,
                        'api': 'mtop.taobao.wsearch.h5search',
                        'v': '1.0',
                        'H5Request': 'true',
                        'ecode': '1',
                        'type': 'jsonp',
                        'dataType': 'jsonp',
                        'callback': 'mtopjsonp1',
                        'data': '{{"q":"{0}","search":"提交","tab":"{2}","sst":"1","n":20,"buying":"buyitnow","m":"api4h5","token4h5":"","abtest":"29","wlsort":"29","page":{1}}}'
                                .format(i, 1, 'all' if self.name == 'TaoBao' else 'mall')
                    }
                })
        web_spider.start_working(fetcher_num=2)
        # wait for finished
        web_spider.wait_for_finished()
Example #16
# Package Imports
import authorization
import utils
import settings

# Help Text
helpTxt = settings.get_actions()['/newevent']['helpTxt']

# Candidate Worksheets
sheetNames = ['Socials', 'Professional Events']
socialSheet, profSheet = authorization.get_sheet_objects(sheetNames)

# Keywords - Type attribute
keywords = ['social', 'prof']
socials, profs = utils.get_keywords(keywords)

# Row values on spreadsheet for gspread to retrieve correct values
NAME_ROW = 2
PWD_ROW = 1
"""
Add event into the Candidate Tracker spreadsheet
@params: event, name, pwd - attributes needed to insert into spreadsheet
@return: err - None if the event was added successfully
"""


def add_event(event, name, pwd):
    # Figure out exact event worksheet
    worksheet = None
    if event == 'social':
Example #17
def get_first_summaries(text, stopwords, model):
    """

    :param text: 文档
    :param stopwords: 停用词
    :param model: 词向量模型
    :return: 摘要列表  按照权重从大到小排列[(句子,权重),(句子,权重)]
    """
    # get the list of (position, sentence) pairs
    sentences = utils.get_sentences(text)

    # get the list of sentences
    sen_lis = [x[1] for x in sentences]
    # print(sen_lis)
    # get the document vector
    docvec = generate_vector.doc_vector(text, stopwords, model)

    # get the list of sentence vectors
    sen_vecs = []
    for i in range(len(sen_lis)):
        # if it is the first sentence
        if i == 0:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model) *
                GlobalParameters.locFirst_weight)
        # if it is the last sentence
        elif i == len(sen_lis) - 1:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model) *
                GlobalParameters.locLast_weight)
        # otherwise it is a middle sentence
        else:
            sen_vecs.append(
                generate_vector.sentence_vector(sen_lis[i], stopwords, model))

    # compute the list of cosine similarities
    cos_lis = [utils.cos_dist(docvec, x) for x in sen_vecs]

    # compute the keyword-weight list
    # get the keywords
    keywords = utils.get_keywords(text)

    # compute the weights
    keyweights = [utils.keyword_weight(x, keywords) for x in sen_lis]

    # compute the length weights
    len_weigths = [utils.len_weight(x) for x in sen_lis]

    # compute each sentence's final weight from cosine similarity, keyword weight and length weight
    final_weights = [
        cos * keyword * length
        for cos, keyword, length in zip(cos_lis, keyweights, len_weigths)
    ]

    # build the final (sentence, weight) list
    final_lis = []
    for sen, weight in zip(sen_lis, final_weights):
        final_lis.append((sen, weight))

    # sort the sentences by weight, from high to low
    final_lis = sorted(final_lis, key=lambda x: x[1], reverse=True)

    # keep only the first GlobalParameters.first_num sentences for the first-pass summary
    final_lis = final_lis[:GlobalParameters.first_num]

    return final_lis
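
To make the weighting step concrete: each sentence's final weight is the element-wise product of its cosine similarity to the document vector, its keyword weight, and its length weight. A tiny sketch with made-up numbers for two sentences (the len_weigths name is kept as in the code above):

cos_lis = [0.80, 0.50]      # hypothetical cosine similarities
keyweights = [1.20, 1.00]   # hypothetical keyword weights
len_weigths = [0.90, 1.10]  # hypothetical length weights
final_weights = [c * k * l for c, k, l in zip(cos_lis, keyweights, len_weigths)]
print([round(w, 3) for w in final_weights])  # [0.864, 0.55]
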
Example #18
    def get_keyword(self):
        return self.keyword

    def get_num_linha(self):
        return self.num_linha

    def get_sentenca(self):
        return self.sentenca


# Go through all the medical records. For each record, check for
# occurrences of the keywords. Store the sentences where the
# keywords were found.
prontuarios = utils.get_prontuarios()
keywords = utils.get_keywords()
lista_ocorrencias = []
for prontuario in prontuarios:
    for keyword in keywords:
        linhas_do_prontuario = utils.get_linhas_arq_texto(prontuario)
        for num_da_linha in range(len(linhas_do_prontuario)):
            linha = linhas_do_prontuario[num_da_linha]
            sentenca = utils.extrair_sentenca(linha, keyword)
            if sentenca:
                ocorrencia = Ocorrencia(prontuario, num_da_linha, keyword,
                                        sentenca)
                lista_ocorrencias.append(ocorrencia)

# Generate the report
relatorio.gerar(prontuarios, keywords, lista_ocorrencias)
Example #19
    print(datetime.now().strftime("%m/%d/%Y, %H:%M:%S | ") + 'Making Keyword & Hint table ~')
    keywords = news.groupby('category').apply(get_keywords)

    t = tqdm(total=25)
    for category, keywordss in tqdm(keywords.iteritems()):
        # insert keywords
        db.insert_keywords(keywordss, category, date, time)
        news_cat = news[news['category']==category]
        for keyword in keywordss:
            # news for keyword
            news_keyword = news_cat[news_cat['text'].str.contains(keyword)]
            n = len(news_keyword)
            news_keyword = pd.concat([news_keyword, get_search_news(keyword, news=n).iloc[:n]], axis=0)

            # hints
            hints = get_keywords(news_keyword, only_noun=True, n=10)
            if keyword in hints:
                hints.remove(keyword)
            hints = hints[:9]
            # insert hints
            keywordID = db.keywordID(keyword, db.categoryID(category))
            db.insert_hints(keywordID, hints)

            # post
            categoryID = db.categoryID(category)
            for i in range(len(news_keyword)):
                postTitle = news_keyword.iloc[i]['title']
                postDetail = news_keyword.iloc[i]['text']
                userID = 'test1'

                a = (postTitle, postDetail, userID, keywordID, categoryID, date)
Example #20
    print(datetime.now().strftime("%m/%d/%Y, %H:%M:%S | ") +
          'Making Keyword & Hint table ~')
    keywords = news.groupby('category').apply(get_keywords)

    t = tqdm(total=25)
    for category, keywordss in tqdm(keywords.iteritems()):
        db.insert_keywords(keywordss, category, date, time)
        news_cat = news[news['category'] == category]
        for keyword in keywordss:
            news_keyword = news_cat[news_cat['text'].str.contains(keyword)]
            n = len(news_keyword)
            news_search = get_search_news(keyword, news=n).iloc[:n]

            hints = get_keywords(pd.concat([news_keyword, news_search],
                                           axis=0),
                                 only_noun=True,
                                 n=10)
            if keyword in hints:
                hints.remove(keyword)
            hints = hints[:9]

            keywordID = db.keywordID(keyword, db.categoryID(category))
            db.insert_hints(keywordID, hints)
            t.update(1)
    t.close()
    print('\033[92m' + 'COMPLETE!' + '\033[0m')

    db.close()
    print(datetime.now().strftime("%m/%d/%Y, %H:%M:%S | ") + '\033[94m' +
          'Bye~' + '\033[0m')