Example #1
            # Load data
            data = pickle.load(open(this_file, 'rb'))

            progress_bar = ChargingBar("Evaluating file {} - {}/{}".format(
                this_filename, this_file_idx + 1, len(test_filepaths)),
                                       suffix="%(percent)d%%")

            if realistic_check:
                # Split the data into slices
                sign = [
                    data['data'][i:i + slice_samples]
                    for i in range(0, len(data['data']), slice_samples)
                ]

                progress_bar.max = len(sign)
                progress_bar.start()

                for this_slice_idx, this_slice in enumerate(sign):
                    """
                    if this_slice_idx % (round(len(sign) / 20)) == 0:
                        print("{}/{} slices evaluated.".format(this_slice_idx, len(sign)))
                    """

                    progress_bar.next()

                    # Without this check, the last slice could have a different size and fail
                    if len(this_slice) == slice_samples:
                        # Analysis
                        this_data = torch.Tensor(this_slice)
                        this_data = this_data.view(1, 1,
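A minimal, self-contained sketch of the ChargingBar pattern used above (assuming the progress package; the workload here is a placeholder):

from progress.bar import ChargingBar

items = list(range(120))  # placeholder workload; any sized collection works

bar = ChargingBar("Evaluating", max=len(items), suffix="%(percent)d%%")
bar.start()
for _ in items:
    # per-item work (slicing, tensor building, evaluation) would go here
    bar.next()
bar.finish()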
Example #2
    def scrape(self, term):
        """
        Scrapes metadata of PubMed articles returned by search term query, processes
        abstracts, and stores relevant articles

        :param term: PubMed term query
        """
        print(
            f'Collection: {self._collection.database.name}.{self._collection.name}. Database: PubMed. Term: {term}.'
        )

        base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
        retmax = 10000
        no_id = 0
        unreadable = 0
        abstracts = []
        articles = []
        page = 0
        total = retmax

        # progress bar
        bar = ChargingBar('Getting metadata:',
                          max=total,
                          suffix='%(index)d of %(max)d - %(elapsed_td)s')

        while page < total:
            # gets the query's UIDs and stores them on the E-utilities history server
            url = f'{base}/esearch.fcgi?db=pubmed&term={term}&retstart={page}'
            url += f'&retmax={retmax}&usehistory=y&api_key={PUBMED_API_KEY}'
            response = requests.get(url)

            if not response.ok:
                print(
                    f'\nPubmedScraper could not get UIDs for \'{term}\' on page {page}.'
                )
                continue

            # gets info for retrieving UIDs from history
            soup = BeautifulSoup(response.content, 'html.parser')
            web = soup.webenv.string
            key = soup.querykey.string
            total = int(soup.count.string)
            bar.max = total

            # gets metadata for articles from UIDs
            url = f'{base}/efetch.fcgi?db=pubmed&WebEnv={web}'
            url += f'&query_key={key}&retstart={page}&retmax={retmax}'
            url += f'&retmode=xml&api_key={PUBMED_API_KEY}'
            response = requests.get(url)

            if not response.ok:
                print(
                    f'\nPubmedScraper could not get metadata for \'{term}\' on page {page}.'
                )
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            entries = soup.find_all('pubmedarticle')

            for article in entries:
                # skip the article if both the DOI and UID are missing
                doi = self._get_string(
                    article.find('elocationid', eidtype='doi'))
                uid = self._get_string(article.find('pmid'))
                if not doi and not uid:
                    no_id += 1
                    bar.next()
                    continue

                # store abstract text for use by mat2vec below
                abstract = self._remove_html(article.abstracttext)

                # skip the paper if it has no abstract
                if not abstract:
                    unreadable += 1
                    bar.next()
                    continue

                # segments abstract by sentence
                doc = self.nlp(abstract)
                sentences = []
                is_unreadable = False

                for sent in doc.sents:
                    # processes sentence text using processor from mat2vec
                    try:
                        tokens, materials = self.processor.process(sent.text)
                    except OverflowError:
                        is_unreadable = True
                        break

                    processed_sent = ' '.join(tokens)
                    sentences.append(processed_sent)

                # if processor (from above) throws an error, skip the paper
                if is_unreadable:
                    bar.next()
                    unreadable += 1
                    continue

                processed_abstract = '\n'.join(sentences)

                article = {
                    'doi': doi,
                    'uid': uid,
                    'title': self._remove_html(article.articletitle),
                    'abstract': abstract,
                    'creators': self._get_authors(article.find_all('author')),
                    'publication_name': self._remove_html(article.journal.title),
                    'issn': self._get_string(article.find('issn', issntype='Print')),
                    'eissn': self._get_string(article.find('issn', issntype='Electronic')),
                    'publication_date': self._get_date(article.articledate),
                    'database': 'pubmed',
                    'processed_abstract': processed_abstract,
                }
                articles.append(article)
                abstracts.append(processed_abstract)
                bar.next()

                # classify abstracts if 20000 have been stored
                if len(abstracts) == 20000:
                    self._store(articles, abstracts)
                    articles = []
                    abstracts = []
            page += retmax
        bar.finish()

        # unreadable papers
        print(f'No DOI/UID: {no_id}')
        print(f'Unreadable papers: {unreadable}')

        # classifies and stores metadata
        if abstracts:
            self._store(articles, abstracts)
            print()
        else:
            print('No abstracts to classify.\n')
            return

        # prints classifier metrics
        for classifier in self._classifiers:
            classifier.print_metrics()
            classifier.reset_metrics()

        # prints general tag metrics
        if self._save:
            print(f'Total articles analyzed: {self._gen_total}.')
            print(
                f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
            )
            print()
            self._gen_new = 0
            self._gen_total = 0
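The method above leans on the E-utilities history server: a single esearch call with usehistory=y returns WebEnv, QueryKey, and Count, which the later efetch calls reuse to page through the full result set. A minimal stand-alone sketch of that handoff (no API key, a placeholder term, and a result cap are assumptions made here for brevity):

import requests
from bs4 import BeautifulSoup

BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
term = 'graphene'  # placeholder search term

# one esearch call with usehistory=y registers the result set on the history server
search = requests.get(f'{BASE}/esearch.fcgi?db=pubmed&term={term}&usehistory=y')
soup = BeautifulSoup(search.content, 'html.parser')
web = soup.webenv.string
key = soup.querykey.string
count = int(soup.count.string)

# efetch then pages through the same result set via WebEnv / query_key
for start in range(0, min(count, 20000), 10000):  # capped so the sketch stays small
    fetch = requests.get(
        f'{BASE}/efetch.fcgi?db=pubmed&WebEnv={web}&query_key={key}'
        f'&retstart={start}&retmax=10000&retmode=xml')
    # parse fetch.content with BeautifulSoup, as the method above does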
Example #3
    def scrape_faster(self, query):
        """
        Faster implementation of scrape().
        Note: requires institutional access (e.g. via VPN); otherwise an error will be thrown.

        Scrapes metadata of Elsevier (ScienceDirect) articles returned
        by query, processes abstracts, and stores relevant articles

        :param query: Elsevier database query
        """
        print(
            f'Collection: {self._collection.database.name}.{self._collection.name}. Database: Elsevier. Query: {query}.'
        )

        # create url
        url = f'https://api.elsevier.com/content/metadata/article?query=KEY({query})&apiKey={ELSEVIER_API_KEY}&httpAccept=application%2Fjson'

        articles = []
        abstracts = []
        no_doi = 0
        unreadable = 0
        item = 0
        total = 5000

        # progress bar
        bar = ChargingBar('Getting metadata:',
                          max=total,
                          suffix='%(index)d of %(max)d - %(elapsed_td)s')

        while item < total:
            response = requests.get(url)

            if response.ok:
                data = json.loads(response.content)['search-results']
                records = data['entry']

                # update total to the number of papers matching the query
                if item == 0:
                    total = min(5000, int(data['opensearch:totalResults']))
                    bar.max = total

                    # if there are no results, exit
                    if total == 0:
                        print('Search returned no results.\n')
                        return

                for record in records:
                    doi = record.get('prism:doi')
                    if not doi:
                        no_doi += 1
                        bar.next()
                        continue

                    abstract = record.get('prism:teaser')

                    # if there is no abstract, skip this article
                    if not abstract:
                        unreadable += 1
                        bar.next()
                        continue

                    # segments abstract by sentence
                    doc = self.nlp(abstract)
                    sentences = []
                    is_unreadable = False

                    # processes sentence text using mat2vec processor
                    for sent in doc.sents:
                        try:
                            tokens, materials = self.processor.process(
                                sent.text)
                        except OverflowError:
                            is_unreadable = True
                            break

                        processed_sent = ' '.join(tokens)
                        sentences.append(processed_sent)

                    # if processor (from above) throws an error, skip the paper
                    if is_unreadable:
                        unreadable += 1
                        bar.next()
                        continue

                    processed_abstract = '\n'.join(sentences)

                    # build the article document; it is stored later if it is not already in the collection
                    article = {
                        'doi': doi,
                        'uid': None,
                        'title': record.get('dc:title'),
                        'abstract': abstract,
                        'url': record.get('prism:url'),
                        'creators': self._get_creators(record.get('dc:creator')),
                        'publication_name': record.get('prism:publicationName'),
                        'issn': record.get('prism:issn'),
                        'publication_date': self._get_date(record.get('prism:coverDate')),
                        'database': 'elsevier',
                        'processed_abstract': processed_abstract
                    }
                    articles.append(article)
                    abstracts.append(processed_abstract)
                    bar.next()

                # sets url to next page in search
                url = data['link'][-2]['@href']

            # json file has 25 items per page, so go to the next page
            item += 25
        bar.finish()

        # unreadable papers
        print(f'Unreadable papers: {unreadable}')

        # classifies and stores metadata
        if abstracts:
            self._store(articles, abstracts)
            print()
        else:
            print('No abstracts to classify.\n')
            return

        # prints classifier metrics
        for classifier in self._classifiers:
            classifier.print_metrics()
            classifier.reset_metrics()

        # prints general tag metrics
        if self._save:
            print(f'Total articles analyzed: {self._gen_total}.')
            print(
                f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
            )
            print()
            self._gen_new = 0
            self._gen_total = 0
Example #4
    def scrape(self, subject='', keyword=''):
        """
        Scrapes metadata of Springer Nature articles returned by subject and
        keyword query, processes abstracts, and stores relevant articles

        :param subject: subject constraint for the query; if empty, no subject
            constraint is applied
        :param keyword: keyword constraint for the query; if empty, no keyword
            constraint is applied
        """
        # prints subject and query made
        subject_print = subject if subject else 'None'
        keyword_print = keyword if keyword else 'None'
        print(
            f'Collection: {self._collection.database.name}.{self._collection.name}. Database: Springer Nature. Subject: {subject_print}, Keyword: {keyword_print}.'
        )

        articles = []
        abstracts = []
        unreadable = 0
        no_doi = 0
        item = 0
        total = 100

        # progress bar
        bar = ChargingBar('Getting metadata:',
                          max=total,
                          suffix='%(index)d of %(max)d - %(elapsed_td)s')

        while item < total:
            # builds url and queries API
            url = self._url_builder(item, subject, keyword)
            response = requests.get(url)

            if response.ok:
                data = json.loads(response.content)
                records = data['records']

                # update total to the number of papers matching the query
                if item == 0:
                    total = int(data['result'][0]['total'])
                    bar.max = total

                # gets metadata
                for record in records:
                    # skip the article if the DOI is missing
                    doi = record.get('doi')
                    if not doi:
                        no_doi += 1
                        bar.next()
                        continue

                    # store abstract text for use by mat2vec below
                    abstract = record.get('abstract')

                    # skip the paper if it has no abstract
                    if not abstract:
                        unreadable += 1
                        bar.next()
                        continue

                    # segments abstract by sentence
                    doc = self.nlp(abstract)
                    sentences = []
                    is_unreadable = False

                    # processes sentence text using mat2vec processor
                    for sent in doc.sents:
                        try:
                            tokens, materials = self.processor.process(
                                sent.text)
                        except OverflowError:
                            is_unreadable = True
                            break

                        processed_sent = ' '.join(tokens)
                        sentences.append(processed_sent)

                    # if processor (from above) throws an error, skip the paper
                    if is_unreadable:
                        unreadable += 1
                        bar.next()
                        continue

                    processed_abstract = '\n'.join(sentences)

                    # build the article document; it is stored later if it is not already in the collection
                    article = {
                        'doi': doi,
                        'uid': None,
                        'title': record.get('title'),
                        'abstract': abstract,
                        'url': self._get_url(record.get('url')),
                        'creators': self._get_creators(record.get('creators')),
                        'publication_name': record.get('publicationName'),
                        'issn': record.get('issn'),
                        'eissn': record.get('eIssn'),
                        'publication_date': self._get_date(record.get('publicationDate')),
                        'database': 'springer',
                        'processed_abstract': processed_abstract
                    }
                    articles.append(article)
                    abstracts.append(processed_abstract)
                    bar.next()

                    # classify abstracts if 20000 have been stored
                    if len(abstracts) == 20000:
                        self._store(articles, abstracts)
                        articles = []
                        abstracts = []

            # 100 items per page, so go to next page
            item += 100
        bar.finish()

        # unreadable papers
        print(f'No DOI: {no_doi}')
        print(f'Unreadable papers: {unreadable}')

        # classifies and stores metadata
        if abstracts:
            self._store(articles, abstracts)
            print()
        else:
            print('No abstracts to classify.\n')
            return

        # prints classifier metrics
        for classifier in self._classifiers:
            classifier.print_metrics()
            classifier.reset_metrics()

        # prints general tag metrics
        if self._save:
            print(f'Total articles analyzed: {self._gen_total}.')
            print(
                f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
            )
            print()
            self._gen_new = 0
            self._gen_total = 0
Example #5
    def scrape(self, query):
        """
        Scrapes metadata of Elsevier (ScienceDirect) articles returned
        by query, processes abstracts, and stores relevant articles

        :param query: Elsevier database query
        """
        print(
            f'Collection: {self._collection.database.name}.{self._collection.name}. Database: Elsevier. Query: {query}.'
        )

        # creates search url
        url = f'https://api.elsevier.com/content/search/sciencedirect?query={query}&apiKey={ELSEVIER_API_KEY}&httpAccept=application%2Fjson'

        # gets dois
        dois = []
        item = 0
        total = 5000

        # progress bar
        bar = ChargingBar('Getting DOIs:',
                          max=total,
                          suffix='%(index)d of %(max)d - %(elapsed_td)s')

        while item < total:
            response = requests.get(url)

            if response.ok:
                data = json.loads(response.content)['search-results']

                # update total to the number of papers matching the query
                if item == 0:
                    total = min(5000, int(data['opensearch:totalResults']))
                    bar.max = total

                # stores dois
                for entry in data['entry']:
                    doi = entry.get('prism:doi')
                    if doi:
                        dois.append(doi)
                    bar.next()

                # sets url to next page in search
                url = data['link'][-2]['@href']

                # json file has 25 items per page, so go to the next page
                item += 25
        bar.finish()

        # metadata
        articles = []
        abstracts = []
        unreadable = 0

        if not dois:
            print('No abstracts to classify.\n')
            return

        # progress bar
        bar = ChargingBar('Getting metadata:',
                          max=len(dois),
                          suffix='%(index)d of %(max)d - %(elapsed_td)s')

        for doi in dois:
            url = f'https://api.elsevier.com/content/article/doi/{doi}?apiKey={ELSEVIER_API_KEY}&httpAccept=application%2Fjson'
            response = requests.get(url)

            if response.ok:
                try:
                    data = json.loads(
                        response.content
                    )['full-text-retrieval-response']['coredata']
                except json.decoder.JSONDecodeError:
                    unreadable += 1
                    bar.next()
                    continue

                # store abstract text for use by mat2vec below
                abstract = data.get('dc:description')

                # skip the paper if it has no abstract
                if not abstract:
                    unreadable += 1
                    bar.next()
                    continue

                # segments abstract by sentence
                doc = self.nlp(abstract)
                sentences = []
                is_unreadable = False

                # processes sentence text using processor from mat2vec
                for sent in doc.sents:
                    try:
                        tokens, materials = self.processor.process(sent.text)
                    except OverflowError:
                        is_unreadable = True
                        break

                    processed_sent = ' '.join(tokens)
                    sentences.append(processed_sent)

                # if processor (from above) throws an error, skip the paper
                if is_unreadable:
                    bar.next()
                    unreadable += 1
                    continue

                processed_abstract = '\n'.join(sentences)

                article = {
                    'doi': doi,
                    'uid': None,
                    'title': data.get('dc:title'),
                    'abstract': abstract,
                    'url': data.get('prism:url'),
                    'creators': self._get_creators(data.get('dc:creator')),
                    'publication_name': data.get('prism:publicationName'),
                    'issn': data.get('prism:issn'),
                    'publication_date': self._get_date(data.get('prism:coverDate')),
                    'database': 'elsevier',
                    'processed_abstract': processed_abstract,
                }
                articles.append(article)
                abstracts.append(processed_abstract)
            bar.next()
        bar.finish()

        # unreadable papers
        print(f'Unreadable papers: {unreadable}')

        # classifies and stores metadata
        if abstracts:
            self._store(articles, abstracts)
            print()
        else:
            print('No abstracts to classify.\n')
            return

        # prints classifier metrics
        for classifier in self._classifiers:
            classifier.print_metrics()
            classifier.reset_metrics()

        # prints general tag metrics
        if self._save:
            print(f'Total articles analyzed: {self._gen_total}.')
            print(
                f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
            )
            print()
            self._gen_new = 0
            self._gen_total = 0
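Examples #2 through #5 all share the same paging idiom: the ChargingBar is created with a provisional max, resized once the first response reports the real result count, and advanced once per record. A condensed, stand-alone sketch of that idiom follows; fetch_page, PAGE_SIZE, and the stubbed return values are placeholders rather than part of the original scrapers:

from progress.bar import ChargingBar

PAGE_SIZE = 25  # placeholder; the examples use 25 (Elsevier), 100 (Springer), or retmax (PubMed)


def fetch_page(start):
    """Placeholder for one API request; returns (records, reported_total) for the page at start."""
    return [{'id': start + i} for i in range(PAGE_SIZE)], 50


total = PAGE_SIZE  # provisional until the first response reports the real count
bar = ChargingBar('Getting metadata:',
                  max=total,
                  suffix='%(index)d of %(max)d - %(elapsed_td)s')

item = 0
while item < total:
    records, reported_total = fetch_page(item)

    # resize the bar once the real result count is known, as the scrapers above do
    if item == 0:
        total = reported_total
        bar.max = total

    for record in records:
        # per-record processing (DOI checks, abstract processing, storage) goes here
        bar.next()

    item += PAGE_SIZE
bar.finish()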