Example no. 1
    def get_title(self):
        if self.title is not None:
            return self.title

        site = format_site_from_url(self.url)

        try:
            result = []
            def process_website(result):
                browser = Browser()
                browser.set_handle_robots(False)
                browser.open(self.url, timeout=9.00)
                result.append(browser)
            thread = threading.Thread(target=process_website, args=(result,))
            thread.start()
            thread.join(timeout=10)
            if len(result) == 0:
                raise Exception("browser timedout or failed")
            browser = result[0]
            self.title = "[%s] %s" % (site.encode("Utf-8"), encoding_sucks(clean_title(browser.title())).lower().capitalize())
            self.langue = get_langue_from_html(browser.response().get_data())
            self.save()
            return self.title
        except Exception as e:
            print "Error: fail on %s: %s" % (self.url, e)
            self.title = "[%s] Error: couldn't fetch the title" % site
            self.save()
            return self.title
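
The get_title above bounds the blocking browser.open by running it in a worker thread and joining with a timeout, so a hung request cannot stall the caller. A generic, stand-alone sketch of the same pattern, assuming nothing from the original class (call_with_timeout and func are invented names):

import threading

def call_with_timeout(func, timeout=10):
    # Run func() in a worker thread and wait at most `timeout` seconds for it.
    result = []

    def worker():
        result.append(func())  # if func() raises, `result` simply stays empty

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()
    thread.join(timeout=timeout)
    if not result:
        # either func() failed or it is still running past the deadline
        raise Exception('call timed out or failed')
    return result[0]
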
    def read(self):
        soup = downloader.read_soup(self.url)
        for div in soup.findAll('div', class_='fileText'):
            href = urljoin(self.url, div.a['href'])
            img = Image(href, self.url, len(self.urls))
            self.urls.append(img.url)

        board = self.url.split('/')[3]
        title = soup.find('span', class_='subject').text
        id_ = int(self.url.split('/thread/')[1].split('/')[0])
        self.title = clean_title(f'[{board}] {title} ({id_})')
    def read(self):
        ui_setting = self.ui_setting
        cw = self.customWidget
        print_ = get_print(cw)
        if self.yt_type == 'video':
            res = get_resolution()
            info = get_videos(self.url,
                              type=self.yt_type,
                              max_res=res,
                              only_mp4=False,
                              audio_included=False,
                              cw=cw)
        else:
            abr = get_abr()
            info = get_videos(self.url, type=self.yt_type, max_abr=abr, cw=cw)
        videos = info['videos']

        cw.enableSegment(overwrite=True)

        # first video must be valid
        while videos:
            video = videos[0]
            try:
                video.url()
                break
            except Exception as e:
                print(e)
                videos.remove(video)
        else:
            raise Exception('No videos')

        if len(videos) > 1:
            p2f = get_p2f(cw)
            if p2f:
                self.single = False
                self.title = clean_title(info['title'])
                self.urls = [video.url for video in videos]
                video = videos[0]
                self.setIcon(video.thumb)
                return
            else:
                video = videos.pop(0)
                cw.gal_num = cw.url = video.url._url
                if videos and cw.alive:
                    s = u', '.join(video.url._url for video in videos)
                    self.exec_queue.put(([s, {
                        'youtube': cw.format
                    }], 'downButton(cw[0], format_selector=cw[1])'))

        self.urls.append(video.url)
        self.artist = video.username
        self.setIcon(video.thumb)

        self.title = video.title
Example no. 4
 def read(self):
     if '/post/' in self.url:
         raise errors.Invalid(
             tr_('개별 다운로드는 지원하지 않습니다: {}').format(self.url))
     self._popular = 'search-Popular.' in self.url
     self.title = clean_title(self.name)
     qs = query_url(self.url)
     q = qs['q'][0]
     for id in get_ids_multi(q, self._popular, self.cw):
         img = Image(id, self.url)
         self.urls.append(img.url)
 def read(self):
     ##        loop = asyncio.new_event_loop()
     ##        asyncio.set_event_loop(loop)
     try:
         info = get_info(self.url, self.cw)
         for img in info['imgs']:
             self.urls.append(img.url)
         self.title = clean_title(info['title'])
     finally:
         ##            loop.close()
         pass
def get_title(soup, cw=None):
    print_ = get_print(cw)
    for h1 in soup.findAll('h1'):
        title = h1.text.strip()
        if title:
            break
    else:
        raise Exception('no title')
    title_clean = clean_title(title)
    print_('get_title: "{}"({}) "{}"({})'.format(title, title.encode('utf8'), title_clean, title_clean.encode('utf8')))
    return title_clean
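
The for/else in get_title is easy to misread: the else branch runs only when the loop scans every h1 without hitting break, i.e. when no heading has non-empty text. A tiny stand-alone illustration of that control flow (the headings list is invented):

headings = ['', '   ', 'Actual Title']
for text in headings:
    title = text.strip()
    if title:
        break  # a non-empty heading was found, so the else block is skipped
else:
    raise Exception('no title')  # runs only if the loop never breaks
print(title)  # -> 'Actual Title'
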
Example no. 7
 def __init__(self, type, url, title, referer, p=0):
     self.type = type
     self.url = LazyUrl(referer, lambda _: url, self)
     ext = os.path.splitext(url.split('?')[0])[1]
     if ext.lower() == '.php':
         ext = '.mp4'
     if type == 'video':
         self.filename = clean_title('{}{}'.format(title, ext))
     else:
         self.filename = '{}{}'.format(p, ext)
     self.title = title
Example no. 8
    def read(self):
        title = clean_title(self.soup.find('h1').text.strip())
        self.title = tr_(u'\uc77d\ub294 \uc911... {}').format(title)
        imgs = get_imgs_all(self.url, title, cw=self.cw)
        for img in imgs:
            if isinstance(img, Image):
                self.urls.append(img.url)
            else:
                self.urls.append(img)

        self.title = title
def get_video(url, soup=None):
    if soup is None:
        html = downloader.read_html(url)
        soup = Soup(html)
        
    video = soup.find('video', id='vjsplayer').find('source').attrs['src']
    url_thumb = soup.find('video', id='vjsplayer').attrs['poster']
    title = get_title(soup)
    filename = u'{}.mp4'.format(clean_title(title))
    video = Video(video, url_thumb, url, filename)
    return video
Example no. 10
    def read(self):
        token_guild_id_list = self.url.split(
            "/"
        )  # Not sure how the input should be passed, so it is split for now; it expects discord_email/password/server_id or discord_token/server_id.

        if len(token_guild_id_list) == 2:
            token = token_guild_id_list[0]
            guild_id = token_guild_id_list[1]
        elif len(token_guild_id_list) == 3:
            email = token_guild_id_list[0]
            password = token_guild_id_list[1]
            guild_id = token_guild_id_list[2]

            response = self.post_account_info(email, password)
            account_info = response.json()
            if response.status_code == 400:
                if account_info.get("captcha_key"):
                    raise errors.Invalid("먼저 웹 또는 디스코드 앱에서 로그인하신후 캡차를 인증해주세요."
                                         )  # returning a message box made it hang, so raise is used instead
                else:
                    raise errors.Invalid(
                        "이메일 또는 비밀번호가 잘못되었습니다. 확인후 다시 시도해주세요.")
            else:
                if not account_info["token"]:
                    raise errors.Invalid(
                        "토큰을 받아오지 못했어요. 2단계인증을 사용중이신경우 토큰을 이용해 요청해주세요.")
                else:
                    token = account_info["token"]
        else:
            raise errors.Invalid("인자값이 더 많이왔어요.")

        guild_info_response = self.get_emoji_list(
            token, int(guild_id))  # GET request sent together with the token

        if guild_info_response.status_code != 200:
            raise errors.Invalid(
                "정상적인 토큰이 아니거나 서버를 찾을수없어요. 맞는 토큰인지, 해당 서버에 접속해있는지 확인해주세요.")
        else:
            guild_info = guild_info_response.json()

        if guild_info["emojis"]:
            base_url = "https://cdn.discordapp.com/emojis/"
            for emoji in guild_info["emojis"]:  # iterate over the guild's emoji list
                if emoji["animated"] is True:  # if the emoji is animated, download it as a gif
                    param = emoji["id"] + ".gif"
                else:  # otherwise download it as a png
                    param = emoji["id"] + ".png"

                self.title = clean_title(
                    f'{guild_info["name"]}({guild_info["id"]})'  # folder name is the server name and id
                )
                self.urls.append(base_url + param + "?v=1")  # join the URL parts
        else:
            raise errors.Invalid("해당 서버에는 이모지가 없어요")
 def id(self):
     if self.type_sankaku == 'www':
         id = u'[www] ' + self.soup.find('h1', class_='entry-title').text.strip()
     else:
         qs = query_url(self.url)
         tags = qs.get('tags', [])
         tags.sort()
         id = u' '.join(tags)
         if not id:
             id = u'N/A'
         id = '[{}] '.format(self.type_sankaku) + id
     return clean_title(id)
Example no. 12
def main():
    # path = "./asset/Donald-Trump-vs-Barack-Obama-on-Nuclear-Weapons-in-East-Asia.txt"
    path = "./asset/People-Arent-Upgrading-Smartphones-as-Quickly-and-That-Is-Bad-for-Apple.txt"
    # path = "./asset/The-Last-Man-on-the-Moon--Eugene-Cernan-gives-a-compelling-account.txt"
    path_synsets = "./asset/synsets.txt"
    path_nasari = "./asset/dd-nasari.txt"

    # Read the synset file produced by the titleSynset.py script
    synsets = utils.read_file_synset(path_synsets)
    # Dictionary of synsets with the word as key and the Babel synset id as value
    word_to_synset = utils.word_to_synset_dict(synsets)

    # Read the NASARI file
    nasari = utils.read_file_nasari(path_nasari)

    # Read the file to test
    text = utils.read_file(path)

    # Find 10 keywords in the file
    keywords = utils.get_key_words(text)
    # print(keywords)

    # Split the text into title and paragraphs
    dictionary = utils.paragraph(text)
    # Clean the title, merging proper nouns into a single token and removing stop words
    dictionary = utils.clean_title(dictionary)
    # print(dictionary)

    # Determine the context
    context = get_context(dictionary["Titolo"], word_to_synset, nasari)
    # print(context)
    # context = []

    # Determine the importance/rank of the paragraphs
    rank_p = rank_paragraphs(dictionary, context, keywords)
    rank_p2 = copy.deepcopy(rank_p)

    print("\n\n\nORIGINAL\n\n\n" + utils.generate_summary(rank_p))

    # Build the summary with the trivial method
    summary = summarize_trivial(
        rank_p2, ratio=0.3
    )  # The ratio can be changed to set the summary percentage
    print("\n\n\nSUMMARY TRIVIAL\n\n\n" + utils.generate_summary(summary))

    # Build the summary with the efficient method
    summary = summarize(
        rank_p, ratio=0.3
    )  # The ratio can be changed to set the summary percentage
    print("\n\n\nSUMMARY\n\n\n" + utils.generate_summary(summary))

    # Save the summaries
    utils.save_summary(summary)
Example no. 13
    def read(self):
        imgs = get_imgs(self.url, self.info, self.cw)
        for img in imgs:
            ext = os.path.splitext(img.split('?')[0])[1]
            if len(imgs) > 1:
                self.filenames[img] = (u'{:04}{}').format(len(self.urls), ext)
            else:
                self.filenames[img] = clean_title(self.name, n=-len(ext)) + ext
            self.urls.append(img)

        self.single = len(imgs) == 1
        self.referer = self.url
        self.title = u'{} (imgur_{})'.format(self.name, self.id_)
Example no. 14
 def name(self):
     id = self.__info.id
     title = self.__info.title
     artist = self.__info.artist
     title = self.format_title('N/A',
                               id,
                               title,
                               artist,
                               'N/A',
                               'N/A',
                               'Korean',
                               prefix='navertoon_')
     return clean_title(title)
Example no. 15
 def name(self):
     title = self._info['title']
     artists = self._info['artists']
     artist = artists[0] if artists else 'N/A'
     title = self.format_title('N/A',
                               ''.join(get_id(self.url)),
                               title,
                               artist,
                               'N/A',
                               'N/A',
                               'Korean',
                               prefix='daumtoon_')
     return clean_title(title)
Example no. 16
 def get(self, referer):
     ext = get_ext(self._url)
     name = self.format_.replace('id', '###id*').replace(
         'page',
         '###page*').replace('artist',
                             '###artist*').replace('title', '###title*')
     name = name.replace('###id*', str(self.id_)).replace(
         '###page*',
         str(self.p)).replace('###artist*',
                              self.artist).replace('###title*', self.title)
     self.filename = clean_title(name.strip(), allow_dot=True,
                                 n=-len(ext)) + ext
     return self._url
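
The two-step replace above (first 'id' -> '###id*', then '###id*' -> str(self.id_), and likewise for page, artist and title) exists so that a substituted value which itself contains one of the placeholder words is not replaced a second time. A single-pass alternative with re.sub behaves the same way; this is only a sketch, and fill_placeholders is an invented name, not the original API:

import re

def fill_placeholders(format_, values):
    # values: e.g. {'id': 123, 'page': 0, 'artist': 'someone', 'title': 'a title'}
    # One pass over the template: replaced text is never re-scanned, so a value
    # containing the word 'title' (or 'id', ...) cannot be substituted twice.
    return re.sub(r'id|page|artist|title', lambda m: str(values[m.group(0)]), format_)
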
 def name(self):
     global pss
     if self._name is None:
         url = self.url
         flickr_auth.get_api(url, self.cw)
         if '/albums/' in url:
             user, ps = find_ps(url)
             self._name = u'{} (flickr_album_{}_{})'.format(
                 ps.title, user.id, ps.id)
         else:
             user = flickr_api.Person.findByUrl(url)
             self._name = u'{} (flickr_{})'.format(user.username, user.id)
     return clean_title(self._name)
Example no. 18
    def get(self, url):
        print_ = get_print(self.cw)
        if self._url:
            return self._url

        info = self.info

        ##        ydl = ytdl.YoutubeDL()
        ##        info = ydl.extract_info(url)

        formats = info['formats']
        print(formats)
        formats = sorted(formats,
                         key=lambda x: int(x.get('abr', 0)),
                         reverse=True)
        url_audio = None

        for format in formats:
            protocol = format['protocol']
            print_(u'【{}】 format【{}】 abr【{}】'.format(protocol,
                                                     format['format'],
                                                     format.get('abr', 0)))
            if not url_audio and protocol in ['http', 'https']:
                url_audio = format['url']

        if not url_audio:
            url_audio = M3u8_stream(formats[0]['url'])
            self.album_art = False  #

        self.username = info['uploader']
        self.title = u'{} - {}'.format(self.username, info['title'])
        self.filename = u'{}{}'.format(
            clean_title(self.title, allow_dot=True, n=-4), '.mp3')

        thumb = None
        for t in info['thumbnails'][::-1]:
            width = t.get('width', 1080)
            if not 100 <= width <= 500:
                continue
            url_thumb = t['url']
            thumb = BytesIO()
            try:
                downloader.download(url_thumb, buffer=thumb)
                break
            except Exception as e:
                print(e)
                thumb = None
        self.thumb = thumb

        self._url = url_audio
        return self._url
Example no. 19
def get_imgs_page(page, title, referer, session, cw):
    print_ = get_print(cw)
    #sleep(2)
    #html = downloader.read_html(page.url, referer, session=session)
    #soup = Soup(html)

    # 2183
    session, soup, page.url = get_soup(page.url, session)

    title_page = clean_title(
        soup.find('span', class_='page-desc').text.strip())
    if page.title != title_page:
        print_('{} -> {}'.format(page.title, title_page))
        page.title = title_page

    views = soup.findAll('div', class_='view-content')\
            + soup.findAll('div', class_='view-padding')
    if not views:
        raise Exception('no views')

    hash = re.find(r'''data_attribute *: *['"](.+?)['"]''', soup.html)
    print_('hash: {}'.format(hash))
    if hash is None:
        raise Exception('no hash')

    imgs = []
    for view in views:
        if view is None:
            continue
        for img in view.findAll('img'):
            if not isVisible(img):
                continue
            src = img.get('data-{}'.format(hash))
            src = src or img.get(
                'content')  # https://manatoki77.net/comic/5266935
            if not src:
                continue
            img = urljoin(page.url, src)
            if '/img/cang' in img:
                continue
            if '/img/blank.gif' in img:
                continue
            img = Image(img, page, len(imgs))
            imgs.append(img)


##    if not imgs:
##        raise Exception('no imgs')

    return imgs
    def read(self):
        if '/video/' in self.url:
            res = clf2.solve(self.url, session=self.session, cw=self.cw)
            soup = Soup(res['html'])
            title = soup.find('h1', id='post_title').text.strip()
            self.title = title
            view = soup.find('div', id='post')
            video = view.find('video')
            src = video.find('source')['src']
            src = urljoin(self.url, src)
            video = Video(src, self.url, title, self.session)
            self.urls.append(video.url)
            self.single = True
            return
        
        if '/image/' not in self.url:
            raise NotImplementedError('Not a post')

        res = clf2.solve(self.url, session=self.session, cw=self.cw)
        soup = Soup(res['html'])
        title = soup.find('h2').text
        paginator = soup.find('div', id='paginator')
        pages = [self.url]
        for a in paginator.findAll('a'):
            href = a.get('href')
            if not href:
                continue
            href = urljoin(self.url, href)
            if href not in pages:
                pages.append(href)

        imgs = []
        for i, page in enumerate(pages):
            if page == self.url:
                soup_page = soup
            else:
                soup_page = downloader.read_soup(page, session=self.session)
            view = soup_page.find('div', id='post')
            for img in view.findAll('img'):
                href = img.parent['href']
                href = urljoin(page, href)
                img = Image(href, page, len(imgs), self.session)
                imgs.append(img)
            self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))

        for img in imgs:
            self.urls.append(img.url)

        self.title = clean_title(title)
 def id(self):
     if self.type_sankaku == 'www':
         id = '[www] ' + self.soup.find('h1', class_='entry-title').text.strip()
     else:
         if '/post/show/' in self.url:
             id = get_id(self.url)
         else:
             qs = query_url(self.url)
             tags = qs.get('tags', [])
             tags.sort()
             id = ' '.join(tags)
             if not id:
                 id = 'N/A'
         id = '[{}] {}'.format(self.type_sankaku, id)
     return clean_title(id)
Example no. 22
    def read(self):
        outdir = get_outdir('kakuyomu')

        self.artist = self.info['artist']
        title_dir = clean_title(u'[{}] {}'.format(self.artist,
                                                  self.info['title']))

        for page in self.info['pages']:
            file = os.path.join(outdir, title_dir, page.filename)
            if os.path.isfile(file):
                self.urls.append(file)
            else:
                self.urls.append(page.file)

        self.title = title_dir
Example no. 23
def extract_section_paragraphs(paragraph, new_json, level=0):
    """
    Formulates the current section in to a cleaner representation with rectified labels.
    :param paragraph: Current paragraph in the terms of service
    :param new_json: The document where the new structure is stored for later output.
    :param level: Whether it is the first or second level heading.
    :return:
    """
    title = clean_title(paragraph["section"][level], grouped_keys)
    text = clean_text(paragraph["text"])
    if title and text:
        new_json["level" + str(level + 1) + "_headings"].append({"section": title,
                                                                 "text": text})

    return new_json
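
Judging from the docstring and the keys used above, the caller passes a paragraph with a 'section' list (one heading per nesting level) and a 'text' field, plus an accumulator dict holding 'level1_headings' and 'level2_headings' lists. A hypothetical call with invented contents, assuming grouped_keys is already defined in the module:

new_json = {'level1_headings': [], 'level2_headings': []}
paragraph = {
    'section': ['1. Privacy', '1.1 Data we collect'],  # headings for level 0 and level 1
    'text': 'We collect the data you provide when creating an account.',
}
new_json = extract_section_paragraphs(paragraph, new_json, level=0)  # appends to level1_headings
new_json = extract_section_paragraphs(paragraph, new_json, level=1)  # appends to level2_headings
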
Example no. 24
    def __init__(self,
                 filename_base: str,
                 ly_filepath: str,
                 pdf_base: str,
                 toc_entry: str = None):
        self.filename_base = filename_base
        self.ly_filepath = ly_filepath
        self.pdf_base = pdf_base  # pdf path without file extension
        self.pdf_filepath = '{}.pdf'.format(pdf_base)

        if toc_entry:
            self.toc_entry = utils.clean_title(toc_entry)
        else:
            # If we didn't get an explicit ToC entry, just use the name of the pdf :-/
            self.toc_entry = self.pdf_filepath
 def __init__(self, type, url, title, referer, p=0, multi_post=False):
     self.type = type
     self.url = LazyUrl(referer, lambda _: url, self)
     ext = get_ext(url)
     if ext.lower() == '.php':
         ext = '.mp4'
     if type == 'video':
         id_ = re.find('videos/([0-9a-zA-Z_-]+)', referer, err='no video id')
         self.filename = format_filename(title, id_, ext) #4287
     elif type == 'image':
         name = '{}_p{}'.format(clean_title(title), p) if multi_post else p
         self.filename = '{}{}'.format(name, ext)
     else:
         raise NotImplementedError(type)
     self.title = title
    def read(self):
        cw = self.cw
        session = self.session

        videos = []
        tab = ''.join(
            self.url.replace('pornhubpremium.com', 'pornhub.com', 1).split('?')
            [0].split('#')[0].split('pornhub.com/')[-1].split('/')[2:3])

        if '/album/' in self.url:
            self.print_('Album')
            info = read_album(self.url, session=session)
            self.single = False
            for photo in info['photos']:
                self.urls.append(photo.url)

            self.title = clean_title(info['title'])
        elif '/photo/' in self.url:
            self.print_('Photo')
            info = read_photo(self.url, session=session)
            for photo in info['photos']:
                self.urls.append(photo.url)

            self.title = info['title']
        elif tab not in ['', 'videos']:
            raise NotImplementedError(tab)
        elif 'viewkey=' not in self.url.lower() and\
             '/embed/' not in self.url.lower() and\
             '/gif/' not in self.url.lower():
            self.print_('videos')
            info = get_videos(self.url, cw)
            hrefs = info['hrefs']
            self.print_('videos: {}'.format(len(hrefs)))

            if not hrefs:
                raise Exception('no hrefs')

            videos = [Video(href, cw, session) for href in hrefs]
            video = self.process_playlist(info['title'], videos)
            self.setIcon(video.thumb)
            self.enableSegment()
        else:
            video = Video(self.url, cw, session)
            video.url()
            self.urls.append(video.url)
            self.setIcon(video.thumb)
            self.title = video.title
            self.enableSegment()
Example no. 27
def main():
    docs = {}
    batch = 1
    for fname in glob.glob('text/*/wiki*', recursive=True):
        print(fname)
        with open(fname) as f:
            in_doc = False
            cur_doc = {}
            cur_lines = []
            for line in f:
                if not in_doc:
                    if line.startswith('<doc id="'):
                        in_doc = True
                        doc_id, title = extract_title_id(line)
                        cur_doc['id'] = doc_id
                        cur_doc['title'] = clean_title(title)
                    continue

                if line.startswith('</doc>'):
                    doc_id = cur_doc['id']
                    del cur_doc['id']

                    text = ''.join(cur_lines)
                    cats = RE_CAT.findall(text)
                    cats = [c.split('|')[0].strip() for _, c, _ in cats]
                    if cats:
                        cur_doc['cats'] = cats
                    is_disam = any(disam in text for disam in DISAMS)
                    if is_disam:
                        cur_doc['dis'] = 1

                    docs[doc_id] = cur_doc

                    in_doc = False
                    cur_doc = {}
                    cur_lines = []

                else:
                    cur_lines.append(line)

        if len(docs) >= 100000:
            dump_to_json(docs, 'expanded/expanded_{}.json'.format(batch))
            docs = {}
            batch += 1

    if docs:
        dump_to_json(docs, 'expanded/expanded_{}.json'.format(batch))
        docs = {}
def parse_synopsis_doc(doc, db):
    parsed_doc = dict()
    parsed_doc['title'] = clean_title(doc['Title'])
    parsed_doc['link'] = doc['Link']
    parsed_doc['synopsis_link'] = doc['Synopsis_Link']
    parsed_doc['origin'] = "Scraper_public_health_ontario"
    parsed_doc['journal_string'] = doc['Journal_String'].strip(' \t\r.')
    parsed_doc['authors'] = doc["Authors"]
    parsed_doc['abstract'] = find_abstract(doc.get('Abstract'))

    paper_fs = gridfs.GridFS(db, collection='Scraper_publichealthontario_fs')
    pdf_file = paper_fs.get(doc['PDF_gridfs_id'])

    # with open('example.pdf', 'wb') as f:
    #     f.write(pdf_file.read())
    #     pdf_file.seek(0)

    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()),
                                            return_dicts=True,
                                            only_printable=True)
    except Exception as e:
        print('Failed to extract PDF %s(%r) (%r)' %
              (doc['Doi'], doc['PDF_gridfs_id'], e))
        traceback.print_exc()
        paragraphs = []

    sections = {}
    last_sec = None
    for p in paragraphs:
        height = p['bbox'][3] - p['bbox'][1]
        width = p['bbox'][2] - p['bbox'][0]
        is_heading = height > 18 and width < 230
        if is_heading:
            last_sec = p['text'].lower()
            sections[last_sec] = []
        elif last_sec is not None:
            sections[last_sec].append(p)

    parsed_doc['synopsis'] = {
        'summary': sections.get('one-minute summary', None),
        'additional_info': sections.get('additional information', None),
        'pho_reviewer_comments': sections.get('pho reviewers comments', None),
    }
    if all(x is None for x in parsed_doc['synopsis'].values()):
        parsed_doc['synopsis'] = None

    return parsed_doc
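
The section splitting above assumes p['bbox'] is [x0, y0, x1, y1]: a paragraph counts as a heading when its box is taller than 18 units and narrower than 230. A small worked check with made-up coordinates:

p = {'text': 'One-Minute Summary', 'bbox': [50, 700, 210, 722]}  # hypothetical values
height = p['bbox'][3] - p['bbox'][1]  # 22 -> taller than 18
width = p['bbox'][2] - p['bbox'][0]   # 160 -> narrower than 230
is_heading = height > 18 and width < 230  # True: this paragraph starts a new section
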
def convert_biorxiv_to_vespa(doc, db):
    # paper_fs = gridfs.GridFS(
    #     db, collection='Scraper_connect_biorxiv_org_fs')
    # pdf_file = paper_fs.get(doc['PDF_gridfs_id'])

    # parsed_content = try_parse_pdf_hierarchy(pdf_file)
    parsed_content = {}

    parsed_doc = {
        'title': clean_title(doc['Title']),
        '_id': doc['_id'],
        'source': doc['Journal'],
        'license': doc['Journal'],
        'datestring': doc['Publication_Date'].strftime('%Y-%m-%d'),
        'doi': doc['Doi'],
        'url': doc['Link'],
        'cord_uid': None,
        'authors': [],
        'bib_entries': None,
        'abstract': ' '.join(doc['Abstract']),
        'journal': doc['Journal'],
        'body_text': parsed_content.get('body', None),
        'conclusion': parsed_content.get('conclusion', None),
        'introduction': parsed_content.get('introduction', None),
        'results': parsed_content.get('result', None),
        'discussion': parsed_content.get('discussion', None),
        'methods': parsed_content.get('method', None),
        'background': parsed_content.get('background', None),
        'timestamp': int(doc['Publication_Date'].timestamp()),
        'pmcid': None,
        'pubmed_id': None,
        'who_covidence': None,
        'has_full_text': len(parsed_content.get('body', '')) > 0,
        'dataset_version': datetime.now().timestamp(),
    }

    for person in doc["Authors"]:
        parsed_doc['authors'].append({
            'first':
            person['Name']['fn'],
            'last':
            person['Name']['ln'],
            'name':
            f'{person["Name"]["fn"]} {person["Name"]["ln"]}'
        })

    return parsed_doc
 def name(self):
     if self._name is None:
         parsed_url = urlparse(self.url)
         qs = parse_qs(parsed_url.query)
         if 'donmai.us/favorites' in self.url:
             id = qs.get('user_id', [''])[0]
             print('len(id) =', len(id), u'"{}"'.format(id))
             assert len(id) > 0, '[Fav] User id is not specified'
             id = u'fav_{}'.format(id)
         else:
             tags = qs.get('tags', [])
             tags.sort()
             id = u' '.join(tags)
         if not id:
             id = u'N/A'
         self._name = id
     return clean_title(self._name)
Example no. 31
    def read(self):
        checkLogin(self.session)

        uid, oid, name = get_id(self.url, self.cw)
        title = clean_title('{} (weibo_{})'.format(name, uid))

        for img in get_imgs(uid,
                            oid,
                            title,
                            self.session,
                            cw=self.cw,
                            d=self,
                            parent=self.mainWindow):
            self.urls.append(img.url)
            self.filenames[img.url] = img.filename

        self.title = title
Example no. 32
def main(data_dir):
    redirects = {}
    batch = 1
    for fname in glob.glob(data_dir + '/*/wiki*', recursive=False):
        print(fname)
        with open(fname) as f:
            in_doc = False
            cur_doc = {}
            cur_lines = []
            for line in f:
                if not in_doc:
                    if line.startswith('<doc id="'):
                        in_doc = True
                        doc_id, title = extract_title_id(line)
                        cur_doc['id'] = doc_id
                        cur_doc['title'] = clean_title(title)
                    continue

                if line.startswith('</doc>'):
                    doc_id = cur_doc['id']
                    del cur_doc['id']

                    for cur_line in cur_lines:
                        m = RE_REDIRECT.search(cur_line)
                        if m:
                            cur_doc['redirect'] = m.group(1)
                            break

                    if 'redirect' in cur_doc:
                        redirects[doc_id] = cur_doc

                    in_doc = False
                    cur_doc = {}
                    cur_lines = []

                else:
                    cur_lines.append(line)

        if len(redirects) >= 100000:
            dump_to_json(redirects, 'expanded/expanded_{}.json'.format(batch))
            redirects = {}
            batch += 1

    if redirects:
        dump_to_json(redirects, 'expanded/expanded_{}.json'.format(batch))
        redirects = {}
    def read(self):
        file = None
        files = None
        title = None
        if '/users/' in self.url or '/user/' in self.url:
            type_ = 'videos'
            try:
                if self.url.split('/users/')[1].split('/')[1] == 'images':
                    type_ = 'images'
            except:
                pass
            info = read_channel(self.url, type_, self.session, self.cw)
            title = info['title']
            urls = info['urls']
            if type_ == 'videos':
                files = [LazyFile(url, type_, self.session) for url in urls]
                file = self.process_playlist('[Channel] [{}] {}'.format(type_.capitalize(), title), files)
            elif type_ == 'images': #4499
                files = []
                for i, url in enumerate(urls):
                    check_alive(self.cw)
                    files += get_files(url, self.session, multi_post=True, cw=self.cw) #4728
                    self.title = '{} {} - {} / {}'.format(tr_('읽는 중...'), title, i, len(urls))
                title = '[Channel] [{}] {}'.format(type_.capitalize(), title)
            else:
                raise NotImplementedError(type_)

        if file is None:
            if files is None:
                files = get_files(self.url, self.session, cw=self.cw)
            for file in files:
                self.urls.append(file.url)
            file = files[0]

            if file.type == 'youtube':
                raise errors.Invalid('[iwara] Youtube: {}'.format(self.url))
            
            if file.type == 'image':
                self.single = False
            title = title or file.title
            if not self.single:
                title = clean_title(title)
            self.title = title
            
        if file.thumb is not None:
            self.setIcon(file.thumb)