Example #1
    def __init__(self, kromsatel_args):
        super().__init__(kromsatel_args)
        self.cleaner = NanoporeReadsCleaner(kromsatel_args)

        self.reads_fpath = self.kromsatel_args.long_read_fpath
        self.chunk_size = self.kromsatel_args.chunk_size

        num_reads_total = \
            _count_unpaired_reads_verbosely(self.reads_fpath)
        self.progress = Progress(num_reads_total)

        output_prefix = fs.rm_fastq_extention(
            os.path.basename(self.reads_fpath)
        )

        if kromsatel_args.split_output:
            self.binner = SplitUnpairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
        else:
            self.binner = SimpleUnpairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
Example #2
def test_add_item_scraped():
    progress = Progress()
    progress.init()
    progress_result = progress.read_progress()
    assert progress_result['items_scraped'] == 0
    assert progress_result['total'] == 0
    assert len(progress_result['items']) == 0

    some_object = {'property': 'value'}
    progress.add_item_scraped(some_object)
    progress_result = progress.read_progress()
    assert len(progress_result['items']) == 1
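The two tests above pin down the reader half of the Progress API: a fresh store starts with zero counts and an empty items list, and add_item_scraped() appends to that list. Below is a minimal sketch consistent with those calls, assuming a JSON file as the backing store; the file name, schema, and _write() helper are illustrative assumptions, not the library's actual layout.

import json

class Progress:
    _PATH = 'progress.json'  # hypothetical storage location

    def init(self):
        # Reset the store to the empty state the tests assert on.
        self._write({'items_scraped': 0, 'total': 0, 'items': []})

    def read_progress(self):
        with open(self._PATH) as handle:
            return json.load(handle)

    def add_item_scraped(self, item):
        state = self.read_progress()
        state['items'].append(item)
        state['items_scraped'] = len(state['items'])
        self._write(state)

    def _write(self, state):
        with open(self._PATH, 'w') as handle:
            json.dump(state, handle)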
Example #3
def run_freida_scraping():
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)
        browser = setup_browser()
        config = FreidaConfig(browser)
        url = config.initial_page
        main(browser, url, config)
        browser.quit()
    except Exception:
        # browser is pre-initialised to None, so this guard cannot
        # raise NameError if setup_browser() itself fails.
        if browser:
            browser.quit()
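Both this example and the next duplicate the browser teardown in an except clause that also swallows the error. A try/finally variant guarantees cleanup on success and failure alike while letting the exception propagate; a sketch using the same helpers assumed above (setup_browser, FreidaConfig, main):

def run_freida_scraping():
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)
        browser = setup_browser()
        config = FreidaConfig(browser)
        main(browser, config.initial_page, config)
    finally:
        # Runs whether main() succeeded or raised, so no driver
        # process is ever left behind.
        if browser:
            browser.quit()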
Example #4
def run_scraping_task():
    browser = None
    try:
        progress = Progress()
        progress.init()
        progress.save_process_progress(False, False)
        # TODO: remove if not using selenium
        browser = setup()

        config = Config(browser)  # TODO: pass browser only if the crawling is selenium-based
        url = config.initial_page
        main(browser, config)
        browser.quit()
    except Exception:
        # browser is pre-initialised to None, so this guard cannot
        # raise NameError if setup() itself fails.
        if browser:
            browser.quit()
Example #5
class LongReadKromsatelCore(KromsatelCore):

    # TODO: stop keeping a reference to kromsatel_args; introduce a dedicated "BlastArguments" class instead
    def __init__(self, kromsatel_args):
        super().__init__(kromsatel_args)
        self.cleaner = NanoporeReadsCleaner(kromsatel_args)

        self.reads_fpath = self.kromsatel_args.long_read_fpath
        self.chunk_size = self.kromsatel_args.chunk_size

        num_reads_total = \
            _count_unpaired_reads_verbosely(self.reads_fpath)
        self.progress = Progress(num_reads_total)

        output_prefix = fs.rm_fastq_extention(
            os.path.basename(self.reads_fpath)
        )

        if kromsatel_args.split_output:
            self.binner = SplitUnpairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
        else:
            self.binner = SimpleUnpairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
        # end if
    # end def


    def run(self):

        reads_chunks = src.fastq.fastq_chunks_unpaired(
            fq_fpath=self.reads_fpath,
            chunk_size=self.chunk_size
        )

        self.progress.print_status_bar()

        self._clean_chunks(reads_chunks)

        self.progress.print_status_bar()
        print()
    # end def

    def _clean_chunks(self, reads_chunks):
        with mp.Pool(self.threads_num) as pool:
            task_iterator = pool.imap(
                self._clean_nanopore_chunk,
                reads_chunks,
                chunksize=1
            )
            # Drain the lazy iterator so every chunk is processed,
            # then shut the pool down while it is still alive.
            for _ in task_iterator:
                pass
            # end for
            pool.close()
            pool.join()
        # end with
    # end def

    def _clean_nanopore_chunk(self, reads_chunk):

        alignments = parse_alignments_nanopore(
            src.blast.blast_align(reads_chunk, self.kromsatel_args)
        )

        self.cleaner.fill_binner(reads_chunk, alignments, self.binner)

        self._write_output()
        increment = len(reads_chunk)
        self._update_progress(increment)
        self._print_progress()
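In the two kromsatel cores, Progress is constructed with the total read count and rendered with print_status_bar(). Below is a minimal count-based sketch matching that usage; only the constructor argument and print_status_bar() appear in the examples, so the increment() method and the bar layout are illustrative assumptions.

import sys

class Progress:

    def __init__(self, num_reads_total):
        self.num_reads_total = num_reads_total
        self.num_done = 0

    def increment(self, count):
        self.num_done += count

    def print_status_bar(self, width=50):
        # Redraw a single-line bar such as "[=====     ] 120/456 reads".
        fraction = self.num_done / self.num_reads_total if self.num_reads_total else 1.0
        filled = int(width * fraction)
        bar = '=' * filled + ' ' * (width - filled)
        sys.stdout.write('\r[{}] {}/{} reads'.format(
            bar, self.num_done, self.num_reads_total))
        sys.stdout.flush()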
Example #6
class IlluminaPEKromsatelCore(KromsatelCore):

    def __init__(self, kromsatel_args):
        super().__init__(kromsatel_args)
        self.cleaner = IlluminaPEReadsCleaner(kromsatel_args)

        self.frw_read_fpath = self.kromsatel_args.frw_read_fpath
        self.rvr_read_fpath = self.kromsatel_args.rvr_read_fpath
        self.chunk_size = self.kromsatel_args.chunk_size

        num_reads_total = \
            _count_paired_reads_verbosely(self.frw_read_fpath)
        self.progress = Progress(num_reads_total)

        output_prefix = fs.rm_fastq_extention(
            os.path.basename(self.frw_read_fpath)
        )

        if kromsatel_args.split_output:
            self.binner = SplitPairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
        else:
            self.binner = SimplePairedBinner(
                self.kromsatel_args.outdir_path,
                output_prefix,
                self.kromsatel_args.min_len
            )
        # end if
    # end def

    def run(self):

        reads_chunks = src.fastq.fastq_chunks_paired(
            frw_read_fpath=self.frw_read_fpath,
            rvr_read_fpath=self.rvr_read_fpath,
            chunk_size=self.chunk_size
        )

        self.progress.print_status_bar()

        self._clean_chunks(reads_chunks)

        self.progress.print_status_bar()
        print()
    # end def

    def _clean_chunks(self, reads_chunks):
        with mp.Pool(self.threads_num) as pool:
            task_iterator = pool.imap(
                self._clean_illumina_pe_chunk,
                reads_chunks,
                chunksize=1
            )
            # Drain the lazy iterator so every chunk is processed,
            # then shut the pool down while it is still alive.
            for _ in task_iterator:
                pass
            # end for
            pool.close()
            pool.join()
        # end with
    # end def

    def _clean_illumina_pe_chunk(self, reads_chunk):

        alignments = self._align_read_pairs(reads_chunk)

        self.cleaner.fill_binner(reads_chunk, alignments, self.binner)

        self._write_output()
        increment = len(reads_chunk[0])
        self._update_progress(increment)
        self._print_progress()
    # end def

    def _align_read_pairs(self, reads_chunk):
        frw_chunk = reads_chunk[0]
        frw_alignments = parse_alignments_illumina(
            src.blast.blast_align(frw_chunk, self.kromsatel_args)
        )

        rvr_chunk = reads_chunk[1]
        rvr_alignments = parse_alignments_illumina(
            src.blast.blast_align(rvr_chunk, self.kromsatel_args)
        )

        alignments = (frw_alignments, rvr_alignments)

        return alignments
Example #7
import sys
import os
from flask_bootstrap import Bootstrap
from flask import Flask, render_template, jsonify, redirect
from src.progress import Progress
from src.crawling_threading import CrawlingThreading

if getattr(sys, 'frozen', False):
    template_folder = os.path.join(sys._MEIPASS, 'templates')
    app = Flask(__name__, template_folder=template_folder)
else:
    template_folder = os.path.join(os.path.dirname(__file__), 'templates')
    app = Flask(__name__, template_folder=template_folder)

Bootstrap(app)
progress = Progress()
progress.init()


@app.route('/')
def app_entrypoint():
    global progress
    try:
        progress_result = progress.read_progress()
    except Exception:
        progress_result = progress
    return render_template('index.html', progress=progress_result)


@app.route('/scrape')
def scrape_beerwulf():
Example #8
    def notify(self, file) -> None:
        self.queue.put(Progress(file, self.done.value, self.failed.value, self.files_count))
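Here Progress is not a store at all but an immutable message: notify() snapshots the shared counters and pushes the record onto a queue for a reporter to consume. A sketch of such a record follows; the field names come from the call site above, while the frozen-dataclass choice is an assumption.

from dataclasses import dataclass

@dataclass(frozen=True)
class Progress:
    # One snapshot of worker state, cheap to pickle across processes.
    file: str
    done: int
    failed: int
    files_count: int

A reporter process would then loop on queue.get() and redraw its display from each snapshot.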
Example #9
def test_read_total_number_items():

    progress = Progress()
    progress.init()
    total_number_items = progress.read_total_number_items()
    assert total_number_items == 0
Example #10
def test_save_number_items_scraped_so_far():
    progress = Progress()
    progress.init()
    total_number_items = progress.read_total_number_items()
    items_scraped_so_far = progress.read_number_items_scraped_so_far()
    assert total_number_items == 0
    assert items_scraped_so_far == 0

    progress.save_number_items_scraped_so_far(10)
    progress.save_total_number_items(5)
    items_scraped_so_far = progress.read_number_items_scraped_so_far()
    total_number_items = progress.read_total_number_items()
    assert items_scraped_so_far == 10
    assert total_number_items == 5
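This test pins down the counter half of the persistence API. Extending the JSON-backed sketch given after Example #2, the four counter methods could look like the following; the storage layout remains an assumption.

    def save_total_number_items(self, total):
        state = self.read_progress()
        state['total'] = total
        self._write(state)

    def read_total_number_items(self):
        return self.read_progress()['total']

    def save_number_items_scraped_so_far(self, count):
        state = self.read_progress()
        state['items_scraped'] = count
        self._write(state)

    def read_number_items_scraped_so_far(self):
        return self.read_progress()['items_scraped']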
Example #11
    def crawl_all_pages_to_end(self, initial_url, file_to_save_results):
        # Create the tracker before the try block so the except handler
        # below can always record the failure on it.
        progress = Progress()

        try:
            url_first_load = initial_url
            self.browser.get(
                "https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140"
            )
            self.config.answer_prompt_questions()

            # Wait for page to load
            wait = WebDriverWait(self.browser,
                                 10,
                                 ignored_exceptions=[
                                     StaleElementReferenceException,
                                     NoSuchElementException
                                 ])
            items_presence = EC.presence_of_element_located(
                (By.CSS_SELECTOR, self.config.next_button_css_selector))
            wait.until(items_presence)

            self.browser.execute_script('window.open("' + url_first_load +
                                        '");')

            # Switch to the new tab
            self.browser.switch_to.window(self.browser.window_handles[1])

            # Create dataframe to hold the data
            dataframe_file = Path(file_to_save_results)
            if not dataframe_file.exists():
                dataframe = pd.DataFrame()
            else:
                dataframe = pd.read_csv(file_to_save_results)

            jsonData = self.loadJsonContent(self.browser.page_source)

            total_number_items = self.total_number_items_to_scrape(jsonData)
            progress.save_total_number_items(total_number_items)
            progress.save_process_progress(False, False)

            for pagination_item in jsonData['solrPagination']:

                url_to_parse = urljoin(self.config.host,
                                       pagination_item["url"])
                self.browser.get(url_to_parse)
                page_json_data = self.loadJsonContent(self.browser.page_source)

                for item in page_json_data["searchResults"]:
                    cleaned_item = self.extract_item_data(item)
                    item_serie = pd.Series(cleaned_item,
                                           index=cleaned_item.keys())

                    # DataFrame.append() was removed in pandas 2.0;
                    # concatenate the one-row frame instead.
                    dataframe = pd.concat(
                        [dataframe, item_serie.to_frame().T],
                        ignore_index=True, sort=False
                    )
                    progress.save_number_items_scraped_so_far(
                        dataframe.shape[0])
                    progress.add_item_scraped(cleaned_item)
                    progress.save_process_progress(False, False)

                dataframe.to_csv(file_to_save_results)

            progress.save_process_progress(True, False)
            return True
        except Exception as e:
            print(str(e))
            progress.save_process_progress(True, True)
            return False