Example n. 1
def refine(search_path: str,
           categories: Optional[dict] = None,
           highlights: Optional[list] = None,
           show_abstract: Optional[bool] = False,
           show_extra_info: Optional[bool] = False,
           only_selected_papers: Optional[bool] = False,
           only_removed_papers: Optional[bool] = False,
           read_only: Optional[bool] = False,
           verbose: Optional[bool] = False):
    """
    When you have a search result and want to refine it, this is the method you'll need to call.
    This method will iterate through all the papers, showing their collected data,
    asking whether you want to select each paper, and assigning a category if a list of categories is provided.
    To help you with the refinement, this method can also highlight terms in the paper's abstract, given a list of them.

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    categories : dict, optional
        A dict with lists of categories by their facets, used to assign to selected papers, by default None
        E.g.:
            {
                'Research Type': [
                    'Validation Research', 'Evaluation Research', 'Solution Proposal', 'Philosophical', 'Opinion', 'Experience'
                ],
                'Contribution': [
                    'Metric', 'Tool', 'Model', 'Method'
                ]
            }
    highlights : list, optional
        A list of terms to highlight in the paper's abstract, by default None
    show_abstract : bool, optional
        A flag to indicate if the abstract should be shown or not, by default False
    show_extra_info : bool, optional
        A flag to indicate if the paper's extra info should be shown or not, by default False
    only_selected_papers : bool, optional
        If True, only the selected papers will be refined, by default False
    only_removed_papers : bool, optional
        If True, only the removed papers will be refined, by default False
    read_only : bool, optional
        If True, this method will only list the papers, by default False
    verbose : Optional[bool], optional
        If True, enables verbose logging, by default False
    """

    common_util.logging_initialize(verbose)
    common_util.check_write_access(search_path)

    init(autoreset=True)  # colorama initializer

    if categories is None:
        categories = {}
    if highlights is None:
        highlights = []

    search = persistence_util.load(search_path)

    has_already_refined_papers = any(
        paper.selected is not None for paper in search.papers)

    todo_papers = []
    done_papers = []

    for paper in search.papers:
        if (only_selected_papers or only_removed_papers):
            if paper.selected is not None and (
                (only_selected_papers and paper.selected) or
                (only_removed_papers and not paper.selected)):
                todo_papers.append(paper)
        else:
            if paper.selected is None or read_only:
                todo_papers.append(paper)
            else:
                done_papers.append(paper)

    todo_papers = sorted(todo_papers,
                         key=lambda x: x.publication_date,
                         reverse=True)

    for i, paper in enumerate(todo_papers):

        print(f'\n{"." * os.get_terminal_size()[0]}\n')

        if not read_only:
            print(f'\n{Fore.CYAN}{i+1}/{len(todo_papers)} papers\n')

        _print_paper_details(paper, highlights, show_abstract, show_extra_info)

        if not read_only:

            answer = _get_select_question_input()

            if answer == 'Skip':
                continue
            elif answer == 'No':
                paper.selected = False
            elif answer == 'Yes':
                paper.selected = True
            else:
                break

            if paper.selected:
                paper.categories = _get_category_question_input(categories)

            done_papers.append(paper)

    if read_only:
        print(f'\n{Fore.CYAN}{len(todo_papers)} papers\n')
    else:
        persistence_util.save(search, search_path)
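
A minimal usage sketch for the method above, assuming it is importable as findpapers.refine (the CLI wrapper in Example n. 5 calls it this way); the path, categories, and highlight terms are illustrative:

import findpapers  # assuming the package exposes refine at module level

categories = {
    'Research Type': ['Validation Research', 'Evaluation Research'],
    'Contribution': ['Metric', 'Tool', 'Model', 'Method'],
}

findpapers.refine(
    '/tmp/search_results.json',
    categories=categories,
    highlights=['gene', 'protein'],
    show_abstract=True)
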
Example n. 2
def bibtex(
        filepath: str = typer.Argument(
            ..., help='A valid file path for the search result file'),
        outputpath: str = typer.Argument(
            ...,
            help='A valid directory path where the generated BibTeX file will be placed'),
        only_selected_papers: bool = typer.Option(
            False,
            "-s",
            "--selected",
            show_default=True,
            help="A flag to indicate that only selected papers (selections can be made with the refine command) will be used for the BibTeX generation"),
        categories: List[str] = typer.Option(
            [],
            "-c",
            "--categories",
            show_default=True,
            help="A comma-separated list of categories (categorization can be done with the refine command) used to filter which papers will be used for the BibTeX generation, following the pattern: <facet>:<term_b>,<term_c>,..."),
        add_findpapers_citation: bool = typer.Option(
            False,
            "-f",
            "--findpapers",
            show_default=True,
            help="A flag to indicate that an entry for Findpapers should be added to the BibTeX output file"),
        verbose: bool = typer.Option(
            False,
            "-v",
            "--verbose",
            show_default=True,
            help="If present, enables verbose logging")):
    """
    Generate a BibTeX file from the search results.

    You can generate the BibTeX file only for the selected papers by using the -s (or --selected) flag.

    You can filter which categorized papers will be used for the BibTeX generation by providing
    a comma-separated list of categories through the -c (or --categories) argument.
    You need to define these categories following the pattern: <facet>:<term_b>,<term_c>,...

    E.g.: 
    --categories "Contribution:Metric,Tool"
    
    The -c parameter can be defined several times, so you can define as many filters as you want.
    The -c parameter is case-sensitive.

    You can control the command logging verbosity by the -v (or --verbose) argument.

    """

    try:
        common_util.logging_initialize(verbose)

        categories_by_facet = {} if len(categories) > 0 else None
        for categories_string in categories:
            string_split = categories_string.split(':')
            facet = string_split[0].strip()
            categories_by_facet[facet] = [
                x.strip() for x in string_split[1].split(',')
            ]

        findpapers.generate_bibtex(filepath, outputpath, only_selected_papers,
                                   categories_by_facet,
                                   add_findpapers_citation)
    except Exception as e:
        if verbose:
            logging.debug(e, exc_info=True)
        else:
            typer.echo(e)
        raise typer.Exit(code=1)
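
For reference, a minimal standalone sketch of the -c parsing performed in the command body above (parse_category_option is a hypothetical helper name, not part of the library):

from typing import Dict, List

def parse_category_option(values: List[str]) -> Dict[str, List[str]]:
    # Mirrors the parsing in the command body: each value follows
    # the pattern <facet>:<term_b>,<term_c>,...
    categories_by_facet: Dict[str, List[str]] = {}
    for value in values:
        facet, _, terms = value.partition(':')
        categories_by_facet[facet.strip()] = [t.strip() for t in terms.split(',')]
    return categories_by_facet

# parse_category_option(['Contribution:Metric,Tool'])
# -> {'Contribution': ['Metric', 'Tool']}
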
Example n. 3
def search(outputpath: str, query: Optional[str] = None, since: Optional[datetime.date] = None, until: Optional[datetime.date] = None,
        limit: Optional[int] = None, limit_per_database: Optional[int] = None, databases: Optional[List[str]] = None,
        publication_types: Optional[List[str]] = None, scopus_api_token: Optional[str] = None, ieee_api_token: Optional[str] = None,
        proxy: Optional[str] = None, verbose: Optional[bool] = False):
    """
    When you have a query and need to get papers using it, this is the method you'll need to call.
    This method will find papers from some databases based on the provided query.

    Parameters
    ----------
    outputpath : str
        A valid file path where the search result file will be placed

    query : str, optional

        A query string that will be used to perform the papers search.
        
        If not provided, the query will be loaded from the environment variable FINDPAPERS_QUERY

        All the query terms need to be enclosed in square brackets and can be associated using boolean operators,
        and grouped using parentheses. 
        E.g.: [term A] AND ([term B] OR [term C]) AND NOT [term D]

        You can use some wildcards in the query too. Use ? to replace a single character or * to replace any number of characters. 
        E.g.: "son?" -> will match song, sons, ...
        E.g.: "son*" -> will match song, sons, sonar, songwriting, ...

        Note: All boolean operators need to be uppercase. The boolean operator "NOT" must be preceded by an "AND" operator.

    since : Optional[datetime.date], optional
        A lower bound (inclusive) date that will be used to filter the search results, by default None

    until : Optional[datetime.date], optional
        An upper bound (inclusive) date that will be used to filter the search results, by default None

    limit : Optional[int], optional
        The max number of papers to collect, by default None

    limit_per_database : Optional[int], optional
        The max number of papers to collect per each database, by default None

    databases : List[str], optional
        List of databases where the search should be performed, if not specified all databases will be used, by default None

    publication_types : List[str], optional
        List of publication types to filter when searching; if not specified, all publication types
        will be collected (this parameter is case insensitive). The available publication types are: journal, conference proceedings, book, other, by default None

    scopus_api_token : Optional[str], optional
        An API token used to fetch data from the Scopus database. If you don't have one, go to https://dev.elsevier.com and get it, by default None

    ieee_api_token : Optional[str], optional
        An API token used to fetch data from the IEEE database. If you don't have one, go to https://developer.ieee.org and get it, by default None
    
    proxy : Optional[str], optional
        Proxy URL that can be used during requests. This can also be defined by the environment variable FINDPAPERS_PROXY, by default None

    verbose : Optional[bool], optional
        If True, enables verbose logging, by default False
    """

    common_util.logging_initialize(verbose)

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy
    
    logging.info('Let\'s find some papers, this process may take a while...')

    if databases is not None:
        databases = [x.lower() for x in databases]
    
    if publication_types is not None:
        publication_types = [x.lower().strip() for x in publication_types]
        for publication_type in publication_types:
            if publication_type not in ['journal', 'conference proceedings', 'book', 'other']:
                raise ValueError(f'Invalid publication type: {publication_type}')

    if query is None:
        query = os.getenv('FINDPAPERS_QUERY')

    if query is not None:
        query = _sanitize_query(query)

    if query is None or not _is_query_ok(query):
        raise ValueError('Invalid query format')

    common_util.check_write_access(outputpath)

    if ieee_api_token is None:
        ieee_api_token = os.getenv('FINDPAPERS_IEEE_API_TOKEN')

    if scopus_api_token is None:
        scopus_api_token = os.getenv('FINDPAPERS_SCOPUS_API_TOKEN')

    search = Search(query, since, until, limit, limit_per_database, databases=databases, publication_types=publication_types)

    if databases is None or arxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: arxiv_searcher.run(search),
                        search, arxiv_searcher.DATABASE_LABEL)
    
    if databases is None or pubmed_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: pubmed_searcher.run(search),
                        search, pubmed_searcher.DATABASE_LABEL)

    if databases is None or acm_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: acm_searcher.run(search),
                        search, acm_searcher.DATABASE_LABEL)

    if ieee_api_token is not None:
        if databases is None or ieee_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: ieee_searcher.run(
                search, ieee_api_token), search, ieee_searcher.DATABASE_LABEL)
    else:
        logging.info('IEEE API token not found, skipping search on this database')

    if scopus_api_token is not None:
        if databases is None or scopus_searcher.DATABASE_LABEL.lower() in databases:
            _database_safe_run(lambda: scopus_searcher.run(
                search, scopus_api_token), search, scopus_searcher.DATABASE_LABEL)
    else:
        logging.info('Scopus API token not found, skipping search on this database')

    if databases is None or medrxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: medrxiv_searcher.run(search),
                        search, medrxiv_searcher.DATABASE_LABEL)

    if databases is None or biorxiv_searcher.DATABASE_LABEL.lower() in databases:
        _database_safe_run(lambda: biorxiv_searcher.run(search),
                        search, biorxiv_searcher.DATABASE_LABEL)

    logging.info('Enriching results...')

    _enrich(search, scopus_api_token)

    logging.info('Filtering results...')

    _filter(search)

    logging.info('Finding and merging duplications...')

    search.merge_duplications()

    logging.info('Flagging potentially predatory publications...')

    _flag_potentially_predatory_publications(search)

    logging.info(f'It\'s finally over! {len(search.papers)} papers retrieved. Good luck with your research :)')

    persistence_util.save(search, outputpath)
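
A minimal usage sketch of the method above, assuming it is importable as findpapers.search (the CLI wrapper in Example n. 6 calls it this way); the output path and query are illustrative, and since no API tokens are passed, the IEEE and Scopus searches would be skipped:

import datetime

import findpapers  # assuming the package exposes search at module level

findpapers.search(
    '/tmp/search_results.json',
    query='[machine learning] AND ([protein] OR [gene])',
    since=datetime.date(2020, 1, 1),
    until=datetime.date(2020, 12, 31),
    limit_per_database=50,
    databases=['arxiv', 'pubmed'])
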
Example n. 4
def download(
        filepath: str = typer.Argument(
            ..., help='A valid file path for the search result file'),
        outputpath: str = typer.Argument(
            ...,
            help='A valid directory path where the downloaded papers will be placed'),
        only_selected_papers: bool = typer.Option(
            False,
            "-s",
            "--selected",
            show_default=True,
            help="A flag to indicate that only selected papers (selections can be made with the refine command) will be downloaded"),
        categories: List[str] = typer.Option(
            [],
            "-c",
            "--categories",
            show_default=True,
            help="A comma-separated list of categories (categorization can be done with the refine command) used to filter which papers will be downloaded, following the pattern: <facet>:<term_b>,<term_c>,..."),
        proxy: str = typer.Option(
            None,
            "-x",
            "--proxy",
            show_default=True,
            help="Proxy URL that can be used during requests"),
        verbose: bool = typer.Option(
            False,
            "-v",
            "--verbose",
            show_default=True,
            help="If present, enables verbose logging")):
    """
    Download full-text papers using the search results.

    If you've done your search (and probably the search refinement too) and want to download the papers,
    this is the command you need to call. This command will try to download the PDF version of the papers to
    the output directory path.

    You can download only the selected papers by using the -s (or --selected) flag.

    You can filter which categorized papers will be downloaded by providing
    a comma-separated list of categories through the -c (or --categories) argument.
    You need to define these categories following the pattern: <facet>:<term_b>,<term_c>,...

    E.g.: 
    --categories "Contribution:Metric,Tool"
    
    The -c parameter can be defined several times, so you can define as many filters as you want.
    The -c parameter is case-sensitive.

    We use some heuristics to do our job, but sometimes they won't work properly and we won't be able
    to download some papers. We log the downloads and failures in a download.log file
    placed in the output directory, so you can check the log to find which papers could not be downloaded
    and try to get them manually later.

    Note: Some papers are behind a paywall and can't be downloaded by this command.
    However, if the institution where you study or work provides a proxy that lets you bypass the paywall,
    you can use that proxy configuration here
    by setting the environment variable FINDPAPERS_PROXY.
    
    You can control the command logging verbosity by the -v (or --verbose) argument.
    """

    try:
        common_util.logging_initialize(verbose)

        categories_by_facet = {} if len(categories) > 0 else None
        for categories_string in categories:
            string_split = categories_string.split(':')
            facet = string_split[0].strip()
            categories_by_facet[facet] = [
                x.strip() for x in string_split[1].split(',')
            ]

        findpapers.download(filepath, outputpath, only_selected_papers,
                            categories_by_facet, proxy)
    except Exception as e:
        if verbose:
            logging.debug(e, exc_info=True)
        else:
            typer.echo(e)
        raise typer.Exit(code=1)
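
As a usage sketch, the shell invocation in the comment below maps onto the library call shown in Example n. 7 (paths and category values are illustrative):

import findpapers

# findpapers download /tmp/search_results.json /tmp/papers/ -s -c "Contribution:Tool"
# is roughly equivalent to:
findpapers.download(
    '/tmp/search_results.json',
    '/tmp/papers/',
    only_selected_papers=True,
    categories_filter={'Contribution': ['Tool']})
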
Example n. 5
def refine(
        filepath: str = typer.Argument(
            ..., help='A valid file path for the search result file'),
        categories: List[str] = typer.Option(
            [],
            "-c",
            "--categories",
            show_default=True,
            help="A comma-separated list of categories to assign to the papers with their facet, following the pattern: <facet>:<term_b>,<term_c>,..."),
        highlights: str = typer.Option(
            None,
            "-h",
            "--highlights",
            show_default=True,
            help="A comma-separated list of terms to be highlighted on the abstract"),
        show_abstract: bool = typer.Option(
            False,
            "-a",
            "--abstract",
            show_default=True,
            help="A flag to indicate if the paper's abstract should be shown or not"),
        show_extra_info: bool = typer.Option(
            False,
            "-e",
            "--extra-info",
            show_default=True,
            help="A flag to indicate if the paper's extra info should be shown or not"),
        only_selected_papers: bool = typer.Option(
            False,
            "-s",
            "--selected",
            show_default=True,
            help="If present, only the selected papers will be refined"),
        only_removed_papers: bool = typer.Option(
            False,
            "-r",
            "--removed",
            show_default=True,
            help="If present, only the removed papers will be refined"),
        read_only: bool = typer.Option(
            False,
            "-l",
            "--list",
            show_default=True,
            help="If this flag is present, this function call will only list the papers"),
        verbose: bool = typer.Option(
            False,
            "-v",
            "--verbose",
            show_default=True,
            help="If present, enables verbose logging")):
    """
    Refine the search results by selecting/classifying the papers.

    When you have a search result and want to refine it, this is the command you'll need to call.
    This command will iterate through all the papers, showing their collected data,
    and asking whether you want to select each paper.

    You can show or hide the paper abstract by using the -a (or --abstract) flag.

    If a comma-separated list of categories is provided by the -c (or --categories) argument, 
    you can assign a category to the paper. You need to define these categories following the pattern: <facet>:<term_b>,<term_c>,...

    E.g.: 
    --categories "Contribution:Metric,Tool,Model,Method"
    --categories "Research Type:Validation Research,Evaluation Research,Solution Proposal,Philosophical,Opinion,Experience"
    
    The -c parameter can be defined several times, so you can define as many facets as you want.
    The -c parameter is case-sensitive.

    To help you with the refinement, this command can also highlight terms in the paper's abstract,
    given a comma-separated list of them through the -h (or --highlights) argument.

    You can control the command logging verbosity by the -v (or --verbose) argument.
    """

    try:
        common_util.logging_initialize(verbose)
        highlights = [x.strip() for x in highlights.split(',')] if highlights is not None else None

        categories_by_facet = {} if len(categories) > 0 else None
        for categories_string in categories:
            string_split = categories_string.split(':')
            facet = string_split[0].strip()
            categories_by_facet[facet] = [
                x.strip() for x in string_split[1].split(',')
            ]

        findpapers.refine(filepath, categories_by_facet, highlights,
                          show_abstract, show_extra_info, only_selected_papers,
                          only_removed_papers, read_only)
    except Exception as e:
        if verbose:
            logging.debug(e, exc_info=True)
        else:
            typer.echo(e)
        raise typer.Exit(code=1)
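
As a usage sketch, the shell invocation in the comment below maps onto the library call shown in Example n. 1 (the path is illustrative):

import findpapers

# findpapers refine /tmp/search_results.json -a -l
# is roughly equivalent to:
findpapers.refine(
    '/tmp/search_results.json',
    show_abstract=True,
    read_only=True)
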
Example n. 6
def search(
        outputpath: str = typer.Argument(
            ...,
            help='A valid file path where the search result JSON file will be placed'),
        query: str = typer.Option(
            None,
            "-q",
            "--query",
            show_default=True,
            help='A query string that will be used to perform the papers search (if not provided it will be loaded from the environment variable FINDPAPERS_QUERY). E.g. [term A] AND ([term B] OR [term C]) AND NOT [term D]'),
        query_filepath: str = typer.Option(
            None,
            "-f",
            "--query-file",
            show_default=True,
            help='A file path that contains the query string that will be used to perform the papers search'),
        since: datetime = typer.Option(
            None,
            "-s",
            "--since",
            show_default=True,
            help="A lower bound (inclusive) date that will be used to filter the search results, following the pattern YYYY-MM-DD. E.g. 2020-12-31",
            formats=["%Y-%m-%d"]),
        until: datetime = typer.Option(
            None,
            "-u",
            "--until",
            show_default=True,
            help="An upper bound (inclusive) date that will be used to filter the search results, following the pattern YYYY-MM-DD. E.g. 2020-12-31",
            formats=["%Y-%m-%d"]),
        limit: int = typer.Option(
            None,
            "-l",
            "--limit",
            show_default=True,
            help="The max number of papers to collect"),
        limit_per_database: int = typer.Option(
            None,
            "-ld",
            "--limit-db",
            show_default=True,
            help="The max number of papers to collect per database"),
        databases: str = typer.Option(
            None,
            "-d",
            "--databases",
            show_default=True,
            help="A comma-separated list of databases where the search should be performed; if not specified, all databases will be used (this parameter is case insensitive)"),
        publication_types: str = typer.Option(
            None,
            "-p",
            "--publication-types",
            show_default=True,
            help="A comma-separated list of publication types to filter when searching; if not specified, all publication types will be collected (this parameter is case insensitive). The available publication types are: journal, conference proceedings, book, other"),
        scopus_api_token: str = typer.Option(
            None,
            "-ts",
            "--token-scopus",
            show_default=True,
            help="An API token used to fetch data from the Scopus database. If you don't have one, go to https://dev.elsevier.com and get it (if not provided it will be loaded from the environment variable FINDPAPERS_SCOPUS_API_TOKEN)"),
        ieee_api_token: str = typer.Option(
            None,
            "-ti",
            "--token-ieee",
            show_default=True,
            help="An API token used to fetch data from the IEEE database. If you don't have one, go to https://developer.ieee.org and get it (if not provided it will be loaded from the environment variable FINDPAPERS_IEEE_API_TOKEN)"),
        proxy: str = typer.Option(
            None,
            "-x",
            "--proxy",
            show_default=True,
            help="Proxy URL that can be used during requests"),
        verbose: bool = typer.Option(
            False,
            "-v",
            "--verbose",
            show_default=True,
            help="If present, enables verbose logging")):
    """
        Search for papers metadata using a query.

        When you have a query and need to get papers using it, this is the command you'll need to call.
        This command will find papers from some databases based on the provided query.

        All the query terms need to be enclosed by square brackets and can be associated using boolean operators,
        and grouped using parentheses. The available boolean operators are "AND", "OR", and "NOT".
        All boolean operators need to be uppercase. The boolean operator "NOT" must be preceded by an "AND" operator.

        E.g.: [term A] AND ([term B] OR [term C]) AND NOT [term D]

        You can use some wildcards in the query too. Use ? to replace a single character or * to replace any number of characters.

        E.g.: 'son?' -> will match song, sons, ...

        E.g.: 'son*' -> will match song, sons, sonar, songwriting, ...

        Currently, we search for papers on the ACM, arXiv, bioRxiv, IEEE, medRxiv, PubMed, and Scopus databases.
        Searching on IEEE and Scopus requires an API token, which must be provided
        by the user using the -ts (or --token-scopus) and -ti (or --token-ieee) arguments.
        If these tokens are not provided, the search on these databases will be skipped.

        You can constrain the search by date using the -s (or --since) and -u (or --until) arguments,
        following the pattern YYYY-MM-DD (E.g. 2020-12-31).
        
        You can restrict the max number of retrieved papers by using -l (or --limit),
        and restrict the max number of retrieved papers per database using the -ld (or --limit-db) argument.

        You can control which databases you would like to use in your search by the -d (or --databases) option. This parameter
        accepts a comma-separated list of database names and is case-insensitive. Currently the available databases are
        ACM, arXiv, bioRxiv, IEEE, medRxiv, PubMed, and Scopus.

        E.g.:
        --databases "scopus,arxiv,acm"
        --databases "ieee,ACM,PubMed"

        You can control which publication types you would like to fetch in your search by the -p (or --publication-types) option. This parameter
        accepts a comma-separated list of publication types and is case-insensitive. Currently the available publication types are
        journal, conference proceedings, book, and other.
        When a particular publication does not fit into any of the other types it is classified as "other", e.g., magazines, newsletters, unpublished manuscripts.

        E.g.:
        --publication-types "journal,conference proceedings,BOOK,other"
        --publication-types "Journal,book"

        You can control the command logging verbosity by the -v (or --verbose) argument.
    """

    try:
        since = since.date() if since is not None else None
        until = until.date() if until is not None else None
        databases = [x.strip() for x in databases.split(',')] if databases is not None else None
        publication_types = [x.strip() for x in publication_types.split(',')] if publication_types is not None else None

        common_util.logging_initialize(verbose)

        if query is None and query_filepath is not None:
            with open(query_filepath, 'r') as f:
                query = f.read().strip()

        findpapers.search(outputpath, query, since, until, limit,
                          limit_per_database, databases, publication_types,
                          scopus_api_token, ieee_api_token, proxy)
    except Exception as e:
        if verbose:
            logging.debug(e, exc_info=True)
        else:
            typer.echo(e)
        raise typer.Exit(code=1)
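
As a usage sketch, the shell invocation in the comment below maps onto the library call shown in Example n. 3 (query and dates are illustrative):

import datetime

import findpapers

# findpapers search /tmp/search_results.json -q "[deep learning]" -s 2020-01-01 -u 2020-12-31 -d "arxiv,pubmed"
# is roughly equivalent to:
findpapers.search(
    '/tmp/search_results.json',
    query='[deep learning]',
    since=datetime.date(2020, 1, 1),
    until=datetime.date(2020, 12, 31),
    databases=['arxiv', 'pubmed'])
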
Example n. 7
def download(search_path: str,
             output_directory: str,
             only_selected_papers: Optional[bool] = False,
             categories_filter: Optional[dict] = None,
             proxy: Optional[str] = None,
             verbose: Optional[bool] = False):
    """
    If you've done your search (and probably the search refinement too) and want to download the papers,
    this is the method you need to call. This method will try to download the PDF version of the papers to
    the output directory path.

    We use some heuristics to do our job, but sometimes they won't work properly and we won't be able
    to download some papers. We log the downloads and failures in a download.log file
    placed in the output directory, so you can check the log to find which papers could not be downloaded
    and try to get them manually later.

    Note: Some papers are behind a paywall and can't be downloaded by this method.
    However, if the institution where you study or work provides a proxy that lets you bypass the paywall,
    you can use that proxy configuration here
    by setting the environment variable FINDPAPERS_PROXY (see the proxy parameter below).

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    output_directory : str
        A valid file path of the directory where the downloaded papers will be placed
    only_selected_papers : bool, optional
        If True, only the selected papers will be downloaded, by default False
    categories_filter : dict, optional
        A dict of categories used to filter which papers will be downloaded, by default None
    proxy : Optional[str], optional
        Proxy URL that can be used during requests. This can also be defined by the environment variable FINDPAPERS_PROXY, by default None
    verbose : Optional[bool], optional
        If True, enables verbose logging, by default False
    """

    common_util.logging_initialize(verbose)

    if proxy is not None:
        os.environ['FINDPAPERS_PROXY'] = proxy

    search = persistence_util.load(search_path)

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    log_filepath = os.path.join(output_directory, 'download.log')

    common_util.check_write_access(log_filepath)

    with open(log_filepath, 'a') as fp:  # 'a' creates the file if it doesn't exist
        now = datetime.datetime.now()
        fp.write(
            f"------- A new download process started at: {datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')} \n"
        )

    for i, paper in enumerate(search.papers):

        logging.info(f'({i+1}/{len(search.papers)}) {paper.title}')

        if (only_selected_papers and not paper.selected) or \
        (categories_filter is not None and (paper.categories is None or not paper.has_category_match(categories_filter))):
            continue

        downloaded = False
        output_filename = f'{paper.publication_date.year}-{paper.title}'
        output_filename = re.sub(r'[^\w\d-]', '_',
                                 output_filename)  # sanitize filename
        output_filename += '.pdf'
        output_filepath = os.path.join(output_directory, output_filename)

        if os.path.exists(output_filepath):  # PDF already collected
            logging.info("Paper's PDF file has already been collected")
            continue

        if paper.doi is not None:
            paper.urls.add(f'http://doi.org/{paper.doi}')

        for url in paper.urls:  # we'll try to download the PDF file of the paper by its URLs
            try:
                logging.info(f'Fetching data from: {url}')

                response = common_util.try_success(
                    lambda url=url: DefaultSession().get(url), 2)

                if response is None:
                    continue

                if 'text/html' in response.headers.get('content-type', '').lower():

                    response_url = urllib.parse.urlsplit(response.url)
                    response_query_string = urllib.parse.parse_qs(
                        urllib.parse.urlparse(response.url).query)
                    response_url_path = response_url.path
                    host_url = f'{response_url.scheme}://{response_url.hostname}'
                    pdf_url = None

                    if response_url_path.endswith('/'):
                        response_url_path = response_url_path[:-1]

                    response_url_path = response_url_path.split('?')[0]

                    if host_url in ['https://dl.acm.org']:

                        doi = paper.doi
                        if doi is None and response_url_path.startswith(
                                '/doi/'
                        ) and '/doi/pdf/' not in response_url_path:
                            doi = response_url_path[4:]
                        elif doi is None:
                            continue

                        pdf_url = f'https://dl.acm.org/doi/pdf/{doi}'

                    elif host_url in ['https://ieeexplore.ieee.org']:

                        if response_url_path.startswith('/document/'):
                            document_id = response_url_path[10:]
                        elif response_query_string.get('arnumber',
                                                       None) is not None:
                            document_id = response_query_string.get(
                                'arnumber')[0]
                        else:
                            continue

                        pdf_url = f'{host_url}/stampPDF/getPDF.jsp?tp=&arnumber={document_id}'

                    elif host_url in [
                            'https://www.sciencedirect.com',
                            'https://linkinghub.elsevier.com'
                    ]:

                        paper_id = response_url_path.split('/')[-1]
                        pdf_url = f'https://www.sciencedirect.com/science/article/pii/{paper_id}/pdfft?isDTMRedir=true&download=true'

                    elif host_url in ['https://pubs.rsc.org']:

                        pdf_url = response.url.replace('/articlelanding/',
                                                       '/articlepdf/')

                    elif host_url in [
                            'https://www.tandfonline.com',
                            'https://www.frontiersin.org'
                    ]:

                        pdf_url = response.url.replace('/full', '/pdf')

                    elif host_url in [
                            'https://pubs.acs.org',
                            'https://journals.sagepub.com',
                            'https://royalsocietypublishing.org'
                    ]:

                        pdf_url = response.url.replace('/doi', '/doi/pdf')

                    elif host_url in ['https://link.springer.com']:

                        pdf_url = response.url.replace(
                            '/article/', '/content/pdf/').replace('%2F',
                                                                  '/') + '.pdf'

                    elif host_url in ['https://www.isca-speech.org']:

                        pdf_url = response.url.replace('/abstracts/',
                                                       '/pdfs/').replace(
                                                           '.html', '.pdf')

                    elif host_url in ['https://onlinelibrary.wiley.com']:

                        pdf_url = response.url.replace('/full/',
                                                       '/pdfdirect/').replace(
                                                           '/abs/',
                                                           '/pdfdirect/')

                    elif host_url in [
                            'https://www.jmir.org', 'https://www.mdpi.com'
                    ]:

                        pdf_url = response.url + '/pdf'

                    elif host_url in ['https://www.pnas.org']:

                        pdf_url = response.url.replace(
                            '/content/', '/content/pnas/') + '.full.pdf'

                    elif host_url in ['https://www.jneurosci.org']:

                        pdf_url = response.url.replace(
                            '/content/', '/content/jneuro/') + '.full.pdf'

                    elif host_url in ['https://www.ijcai.org']:

                        paper_id = response.url.split('/')[-1].zfill(4)
                        pdf_url = '/'.join(response.url.split('/')
                                           [:-1]) + '/' + paper_id + '.pdf'

                    elif host_url in [
                            'https://asmp-eurasipjournals.springeropen.com'
                    ]:

                        pdf_url = response.url.replace('/articles/',
                                                       '/track/pdf/')

                    if pdf_url is not None:

                        response = common_util.try_success(
                            lambda url=pdf_url: DefaultSession().get(url), 2)

                if 'application/pdf' in response.headers.get('content-type', '').lower():
                    with open(output_filepath, 'wb') as fp:
                        fp.write(response.content)
                    downloaded = True
                    break

            except Exception as e:  # pragma: no cover
                logging.debug(e, exc_info=True)

        if downloaded:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[DOWNLOADED] {paper.title}\n')
        else:
            with open(log_filepath, 'a') as fp:
                fp.write(f'[FAILED] {paper.title}\n')
                if len(paper.urls) == 0:
                    fp.write('Empty URL list\n')
                else:
                    for url in paper.urls:
                        fp.write(f'{url}\n')
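
The if/elif chain above hard-codes one PDF URL rewrite per publisher host. For the hosts whose rewrite is a single string substitution, the same idea can be expressed in table-driven form; a hedged sketch, not the library's implementation (hosts needing extra logic, such as ACM, IEEE, Springer, and Wiley, would still require custom branches):

from typing import Optional

# Substitution-only rewrites extracted from the chain above, keyed by host URL.
PDF_URL_REWRITES = {
    'https://pubs.rsc.org': ('/articlelanding/', '/articlepdf/'),
    'https://www.tandfonline.com': ('/full', '/pdf'),
    'https://www.frontiersin.org': ('/full', '/pdf'),
    'https://pubs.acs.org': ('/doi', '/doi/pdf'),
    'https://journals.sagepub.com': ('/doi', '/doi/pdf'),
    'https://royalsocietypublishing.org': ('/doi', '/doi/pdf'),
    'https://asmp-eurasipjournals.springeropen.com': ('/articles/', '/track/pdf/'),
}

def rewrite_pdf_url(host_url: str, page_url: str) -> Optional[str]:
    # Returns the guessed PDF URL, or None when the host needs custom handling.
    rule = PDF_URL_REWRITES.get(host_url)
    if rule is None:
        return None
    old, new = rule
    return page_url.replace(old, new)
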
def generate_bibtex(search_path: str,
                    outputpath: str,
                    only_selected_papers: Optional[bool] = False,
                    categories_filter: Optional[dict] = None,
                    add_findpapers_citation: Optional[bool] = False,
                    verbose: Optional[bool] = False):
    """
    Method used to generate a BibTeX file from a search result

    Parameters
    ----------
    search_path : str
        A valid file path containing a JSON representation of the search results
    outputpath : str
        A valid file path for the BibTeX output file
    only_selected_papers : bool, optional
        If you only want to generate a BibTeX file for selected papers, by default False
    categories_filter : dict, optional
        A dict of categories used to filter which papers will be included in the BibTeX file, by default None
    add_findpapers_citation : bool, optional
        If you want to add an entry for Findpapers in your BibTeX output file, by default False
    verbose : Optional[bool], optional
        If True, enables verbose logging, by default False
    """

    common_util.logging_initialize(verbose)

    search = persistence_util.load(search_path)
    common_util.check_write_access(outputpath)

    default_tab = ' ' * 4
    bibtex_output = ''

    if add_findpapers_citation:
        bibtex_output = '\n'.join([
            '@misc{grosman2020findpapers', '\ttitle = {Findpapers},',
            '\tauthor = {Grosman, Jonatas},', '\tpublisher = {GitHub},',
            '\tjournal = {GitHub repository},',
            '\thowpublished = {\\url{https://github.com/jonatasgrosman/findpapers}},',
            '\tyear = {2020}', '}\n\n'
        ])

    for paper in search.papers:

        if (only_selected_papers and not paper.selected) or \
        (categories_filter is not None and (paper.categories is None or not paper.has_category_match(categories_filter))):
            continue

        logging.info(f'Exporting bibtex for: {paper.title}')

        try:

            citation_type = '@unpublished'
            if paper.publication is not None:
                if paper.publication.category == 'Journal':
                    citation_type = '@article'
                elif paper.publication.category == 'Conference Proceedings':
                    citation_type = '@inproceedings'
                elif paper.publication.category == 'Book':
                    citation_type = '@book'
                else:
                    citation_type = '@misc'

            bibtex_output += f'{citation_type}{"{"}{paper.get_citation_key()},\n'

            bibtex_output += f'{default_tab}title = {{{paper.title}}},\n'

            if len(paper.authors) > 0:
                authors = ' and '.join(paper.authors)
                bibtex_output += f'{default_tab}author = {{{authors}}},\n'

            if citation_type == '@unpublished':
                note = ''
                if len(paper.urls) > 0:
                    note += f'Available at {list(paper.urls)[0]}'
                if paper.publication_date is not None:
                    note += f' ({paper.publication_date.strftime("%Y/%m/%d")})'
                if paper.comments is not None:
                    note += paper.comments if len(
                        note) == 0 else f' | {paper.comments}'
                bibtex_output += f'{default_tab}note = {{{note}}},\n'
            elif citation_type == '@article':
                bibtex_output += f'{default_tab}journal = {{{paper.publication.title}}},\n'
            elif citation_type == '@inproceedings':
                bibtex_output += f'{default_tab}booktitle = {{{paper.publication.title}}},\n'
            elif citation_type == '@misc' and len(
                    paper.urls) > 0 and paper.publication_date is not None:
                date = paper.publication_date.strftime('%Y/%m/%d')
                url = list(paper.urls)[0]
                bibtex_output += f'{default_tab}howpublished = {{Available at {url} ({date})}},\n'

            if paper.publication is not None and paper.publication.publisher is not None:
                bibtex_output += f'{default_tab}publisher = {{{paper.publication.publisher}}},\n'

            if paper.publication_date is not None:
                bibtex_output += f'{default_tab}year = {{{paper.publication_date.year}}},\n'

            if paper.pages is not None:
                bibtex_output += f'{default_tab}pages = {{{paper.pages}}},\n'

            bibtex_output = bibtex_output.rstrip(
                ',\n') + '\n'  # removing last comma

            bibtex_output += '}\n\n'

        except Exception as e:
            logging.debug(e, exc_info=True)

    with open(outputpath, 'w') as fp:
        fp.write(bibtex_output)
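
For illustration, a journal paper exported by the loop above would produce an entry shaped roughly like this (the field values are made up, and the exact citation key format comes from paper.get_citation_key()):

@article{doe2020example,
    title = {An Example Title},
    author = {Doe, John and Roe, Jane},
    journal = {Journal of Examples},
    publisher = {Example Press},
    year = {2020},
    pages = {1-10}
}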