Example #1
def _process_file(mapper: Mapper, fname: str, pool: Pool,
                  file_name_suffix: str, chunksize: int) -> Path:
    """
    Creates a new CSV file by running each row in the input file named fname
    through the mapper using multiple processes.
    """
    ROW_FIELD = "_row"
    try:
        output_path = _output_file_path(fname, file_name_suffix)
        with open(fname, "r") as csvin, open(
                output_path, "w") as csvout, Counter(f"{fname}: ") as counter:
            fieldnames = [ROW_FIELD, SUCCEEDED_FIELD] + mapper.fieldnames
            reader = csv.DictReader(csvin)
            writer = csv.DictWriter(csvout, fieldnames=fieldnames)
            writer.writeheader()
            for i, row in enumerate(
                    pool.imap(mapper.map, reader, chunksize=chunksize), 1):
                counter.next()
                try:
                    row[ROW_FIELD] = i
                    writer.writerow(row)
                    csvout.flush()
                except ValueError as ex:
                    LOG.error("error writing row: %s", str(ex))
        return output_path
    except KeyboardInterrupt:
        sys.stderr.write("Cancelled by Ctrl-C!\n")
        pool.terminate()
        pool.join()
        sys.exit(130)
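All of these examples lean on the same small API from the progress package: construct a Counter with a message prefix, call next() once per processed item, and finish() when done. Counter also works as a context manager (as in Example #1) and, like the other progress classes, offers iter() for wrapping an iterable (as in Examples #25 and #26). A minimal, self-contained sketch of that pattern, assuming only that the progress package from PyPI is installed:

from progress.counter import Counter

def process_items(items):
    """Consume an iterable of unknown length while showing a running count."""
    counter = Counter("Processing items: ")  # message printed before the count
    total = 0
    for _ in items:
        total += 1
        counter.next()  # increment the count and redraw the line
    counter.finish()  # stop updating and move to a new line
    return total

if __name__ == "__main__":
    process_items(range(1000))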
Example #2
def getProjectLinks(url):
    print("Gathering links...")
    projectLinks = set()
    browser = webdriver.Firefox(executable_path=r"C:\Program Files\gecko")
    browser.get(url)
    #Wait for website to load
    time.sleep(5)
    #Closes cookie-acceptance pop-up
    try:
        browser.find_element_by_css_selector(
            '#CybotCookiebotDialogBodyLevelButtonAccept').click()
    except Exception:
        pass
    counter = Counter("Propagating webpages")
    #Simulates click on 'view more' button until exhausted
    while True:
        try:
            #waits random time to prevent DDOS ban
            time.sleep(random.uniform(0.2, 1.3))
            browser.find_element_by_css_selector('a.ng-isolate-scope').click()
        except Exception:
            break
        counter.next()
    counter.finish()
    links = browser.find_elements_by_xpath("//a[@href]")
    for bigl in links:
        #Ensures that the link is a valid project
        if "/projects/" in bigl.get_attribute(
                "href") and "/coming_soon/" not in bigl.get_attribute("href"):
            projectLinks.add(bigl.get_attribute("href"))
    browser.quit()
    return projectLinks
Example #3
def split_video(filepath: str, output_dir: str, metadata):
    # Remove any frames left in the output directory from a previous run
    files = glob.glob(output_dir + '/*')
    for f in files:
        os.remove(f)

    cap = cv.VideoCapture(filepath)

    prev = None
    curr = None
    i = 0
    mse = 100
    counter = Counter("Splitting ")
    while cap.isOpened():
        ret, curr = cap.read()
        # if frame is read correctly ret is True
        if not ret:
            break

        if prev is not None:
            # cast to float before differencing to avoid uint8 wraparound
            mse = np.square(curr.astype(np.float32) - prev.astype(np.float32)).mean()
        if mse > MSE_THRESH:
            new_size = (curr.shape[1] // DOWNSCALE, curr.shape[0] // DOWNSCALE)
            resized = cv.resize(curr, new_size)
            cv.imwrite(output_dir + FORMAT_STRING.format(i), resized)
        prev = curr
        i += 1
        counter.next()
    cap.release()
    yield
Example #4
async def export_users(ctx: ExporterContext):
    users_generator = await utils.with_retry(ctx.slack_client.users_list)
    all_users = []

    counter = Counter("Exporting users ")

    try:
        async for users in users_generator:
            all_users.extend(users["members"])
            for user in users["members"]:
                user_obj = models.SlackUser(user)
                counter.next()

                for url, filename in user_obj.get_exportable_data():
                    full_filename = os.path.join(constants.USERS_EXPORT_DIR,
                                                 filename)
                    ctx.downloader.enqueue_download(url, full_filename)

        await ctx.downloader.flush_download_queue()
    except SlackApiError as e:
        log.error("Got an API error while trying to export user info",
                  exc_info=e)

    ctx.downloader.write_json(
        os.path.join(constants.USERS_EXPORT_DIR, constants.USERS_JSON_FILE),
        all_users)
    counter.finish()
Example #5
def count_files(inp):
    counter = Counter('Loading files tree... ')
    t = 0
    for dirpath, dirs, files in os.walk(inp):
        for filename in files:
            t += 1
            counter.next()
    counter.finish()
    print()
    return t
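A count like the one Example #5 returns is often used as the max of a bounded progress bar on a second pass over the same tree. A hypothetical follow-up sketch of that pattern (the per-file work is a placeholder; count_files is assumed to be the function above):

import os

from progress.bar import Bar

def walk_with_progress(inp):
    total = count_files(inp)  # first pass: count the files (Example #5)
    bar = Bar('Processing files ', max=total)  # second pass: bounded bar
    for dirpath, dirs, files in os.walk(inp):
        for filename in files:
            _ = os.path.join(dirpath, filename)  # placeholder for real per-file work
            bar.next()
    bar.finish()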
Example #6
 def start_sampling(self, percentile_samples: int, info: str) -> None:
     """To run exact compute of the interval or select randomly a subset of all
     possible programs (combining the values of its annotations) to perform an
     approximation of the exact interval"""
     ##To find all possible consistent programs
     #local_n_programs = {}
     n_used_vars = len(self.used_vars)
     poss_prog_format = '{0:0' + str(n_used_vars) + 'b}'
     poss_asignations = pow(2, n_used_vars)
     counter = Counter('Processing possible programs (%d): ' %
                       (poss_asignations),
                       max=poss_asignations)
     for asignation in range(poss_asignations):
         asign_list = list(poss_prog_format.format(asignation))
         unique_world_program = ['x'] * self.utils.em_vars
         for index, value in enumerate(asign_list):
             unique_world_program[int(self.used_vars[index])] = int(value)
         prog, id_prog = self.utils.map_world_to_prog(unique_world_program)
         if id_prog not in self.local_n_programs:
             self.local_n_programs.append(id_prog)
             #evidence = {str(self.used_vars[index]):int(val) for index, val in enumerate(asign_list)}
             #self.local_n_programs[id_prog] = self.utils.em.get_sampling_prob(evidence)
         counter.next()
     counter.finish()
     ##
     #n_programs = self.utils.get_n_programs()
     n_programs = len(self.local_n_programs)
     print("Number of programs: " + str(n_programs))
     if percentile_samples == 100:
         # To compute the exact interval
         lit_to_query = self.utils.search_lit_to_consult()
         n_samples = n_programs
         unique_programs = self.local_n_programs
         #unique_programs = range(n_programs)
         repeated_programs = 0  # ????
     else:
         lit_to_query = self.utils.get_interest_lit()
         n_samples = int(get_percentile(percentile_samples, n_programs))
         sampled_programs = np.random.choice(self.local_n_programs,
                                             n_samples,
                                             replace=True)
         #sampled_programs = np.random.choice(n_programs, n_samples, replace=True)
         unique_programs = list(set(sampled_programs))
         repeated_programs = n_samples - len(unique_programs)
     prog_data = self.consult_programs(unique_programs, self.adapted_annots,
                                       lit_to_query)
     execution_time, inconsistent_programs = prog_data
     self.results['data'] = {
         'n_samples': n_samples,
         'time': execution_time,
         'repeated_programs': repeated_programs,
         'inconsistent_programs': inconsistent_programs,
         'worlds_consulted': len(self.known_evidences)
     }
     write_results(self.results, self.utils.save_path, info)
Example #7
    def import_NICE_KSAT(self, workbook):
        """Import the NICE KSATs and their relationships with Workroles

        :param workbook: NICE CWF spreadsheet represented as a python class
        :type workbook: class 'xlrd.book.Book'
        """

        log.info("Parsing NICE CWF KSATs")
        bar = Counter(
            'Parsing NICE CWF KSATs ',
            suffix='%(percent)d%% (%(index)d/%(max)d) [%(elapsed_td)s]')

        all_sheets = workbook.sheets()

        graph = self.db.graph

        for sheet in all_sheets:
            if not re.match(r"[A-Z]+-[A-Z]+-[0-9]+", sheet.name):
                continue

            workrole_id = re.match(r"([A-Z]{2}-[A-Z]{3}-[0-9]{3})",
                                   sheet.name)[1]

            for row in sheet._cell_values:
                # capture and store the KSAT unless it's a header row
                try:
                    ksat = parse_ksats(row[0])[0]
                except Exception:
                    # Ignore header rows that don't contain KSATs
                    continue

                ksat_node = KSAT()
                ksat_node.id = ksat
                ksat_node.description = row[1]
                ksat_node.type = ksat_id_to_type(ksat)

                # create the node if it doesn't exist
                graph.create(ksat_node)

                # pull the current relationships from the db
                graph.pull(ksat_node)

                ksat_node.__node__.add_label(ksat_node.type.capitalize())

                ksat_node.nice_workrole.add(
                    NICEWorkrole.match(graph, workrole_id).first())

                # store the updated relationship in the db
                graph.push(ksat_node)

                bar.next()

        bar.finish()
        log.info("Done Parsing NICE CWF KSATs")
Example #8
    def analyzeArticles(self,
                        preprocessor: preprocessing.Preprocessor,
                        dtype='reuters'):
        #check data type
        if dtype == 'reuters':
            #initialize SoupLoader
            soupLoader = data.SoupLoader(-1)
            provider = data.ReutersProvider(soupLoader)
        else:
            provider = data.TwentyNewsProvider('../TwentyNews/')

        #start Counters
        bar = PCounter("Analyzing Articles: ")
        counter = Counter()
        occurances = Counter()
        categories = Counter()
        while True:
            try:
                #increase bar progress
                bar.next()

                #throws an exception if there are no more articles. saving is not needed
                article = data.ArticleFactory.GET_NEXT_ARTICLE(provider)

                #update the counter with the preprocessed array of words
                words = preprocessor.process(article).preprocessed
                counter.update(words)
                #update in how many articles these words occur
                occurances.update(list(words.keys()))
                #update categories counter
                categories.update([article.category])

            except data.OutOfArticlesError:
                #abort while loop. No more Articles
                break

        bar.finish()

        self._articleCount = bar.index
        self._words = self.cropWords(counter, occurances)
        self._categories = categories
Example #9
 def __init__(self, method, query_ops):
     if method == Collector._search:
         self.dates = (query_ops['since'], query_ops['until'])
     elif method == Collector._stream:
         self.dates = (datetime.date.today(), query_ops['until'])
     else:
         self.dates = None
     if self.dates:
         days = (self.dates[1] - self.dates[0]).days
         self._progress = Bar('Processing day: ', max=days)
     else:
         self._progress = Counter('Processing tweets: ')
Example #10
    def __init__(self):
        super().__init__()
        self.counter = 0
        self.run_forever = True
        self.limit = 0
        if len(sys.argv) > 1:
            # User provided a limit: collect that many tweets, then stop
            self.limit = int(sys.argv[1])
            self.run_forever = False
            self.bar = Bar('Collecting tweets...', max=self.limit)

        else:
            self.bar = Counter('Collecting tweets...')
Example #11
def load_tweets():
    with open("data_files/perceptron_traindata.json", "r") as training_data_load_file:
        training_data = json.load(training_data_load_file)
    preprocessed_training_data = {}
    stop_terms = generate_stop_terms("config_files/preprocess_stop_terms.txt")
    counter = Counter("Loading tweets...")
    for (tweet, rating) in training_data.items():
        new_tweet = preprocess(tweet, stop_terms)
        new_tweet_text = " ".join(new_tweet)
        preprocessed_training_data[new_tweet_text] = rating
        counter.next()
    counter.finish()
    return preprocessed_training_data
Example #12
 def consult_programs(self, unique_programs: list, adapted_annots: dict,
                      lit_to_query: list) -> list:
     """To iterate over sampled programs consulting for literals"""
     self.results['status'] = {
         lit: copy.copy(STATUS)
         for lit in lit_to_query
     }
     # To count the number of inconsistent programs sampled
     inconsistent_programs = 0
     counter = Counter('Processing programs (%d): ' %
                       (len(unique_programs)),
                       max=len(unique_programs))
     initial_time = time.time()
     for sampled_prog in unique_programs:
         sampled_in_bin = self.utils.id_prog_to_bin(sampled_prog)
         # Build the program from the sampled annotations
         #self.replace_in_program(sampled_in_bin)
         # To create the expression that generate a sampled program
         expression = ''
         for index, value in enumerate(sampled_in_bin):
             if self.utils.prog_in_bin[index] == 'x':
                 if value == 1:
                     expression += adapted_annots[index]['True'] + ' & '
                 else:
                     expression += adapted_annots[index]['False'] + ' & '
         flag = False
         #program = self.utils.map_bin_to_prog(self.utils.prog_in_bin)
         program = self.utils.map_bin_to_prog(sampled_in_bin)
         status = query_to_delp(program, lit_to_query)
         prob = float(0.0)
         models = satisfiable(eval(expression[:-3]), all_models=True)
         for model in models:
             if model:
                 # The sampled program is consistent, is a valid program
                 evidence = to_evidence(model)
                 if evidence not in self.known_evidences:
                     # Get probability of the new evidence
                     prob += self.utils.em.get_sampling_prob(evidence)
                     self.known_evidences.append(evidence)
             else:
                 # The sampled program is inconsistent
                 inconsistent_programs += 1
                 flag = True
         if not flag:
             self.update_results(status, prob)
         counter.next()
     counter.finish()
     print(self.utils.model_path + " <<Complete>>")
     execution_time = time.time() - initial_time
     return [execution_time, inconsistent_programs]
Example #13
 def consult_worlds(self, worlds: list, lit_to_query: list) -> float:
     """To iterate over sampled worlds consulting for literals"""
     self.results['status'] = {
         lit: copy.copy(STATUS)
         for lit in lit_to_query
     }
     # To control if worlds are sampled or generated
     if isinstance(worlds[0], (int, np.int64)):
         to_convert = 'self.utils.id_world_to_bin(sampled_world)'
     else:
         to_convert = 'sampled_world'
     counter = Counter('Processing worlds: ', max=len(worlds))
     initial_time = time.time()
     for sampled_world in worlds:
         # Get world in list format
         world, evidence = eval(to_convert)
         # Get the probability of the world
         prob_world = self.utils.em.get_sampling_prob(evidence)
         # Build the program for world
         program, id_program = self.utils.map_world_to_prog(world)
         status = self.known_progs.search_sample(id_program)
         if status == -1:
             # New program
             status = query_to_delp(program, lit_to_query)
             self.known_progs.save_sample(id_program, status)
             for literal, response in status.items():
                 # Update number of worlds
                 self.results['status'][literal][response['status']] += 1
                 # Update probabilities
                 self.results['status'][literal][
                     'p' + response['status']] += prob_world
                 # Save time to compute the query in the world
                 self.results['status'][literal]['time'] += response['time']
         else:
             # Known program
             for literal, response in status.items():
                 # Update number of worlds
                 self.results['status'][literal][response['status']] += 1
                 # Update probabilities
                 self.results['status'][literal][
                     'p' + response['status']] += prob_world
                 # Save time to compute the query in the world
                 self.results['status'][literal]['time'] += 0
         counter.next()
     counter.finish()
     print(self.utils.model_path + " <<Complete>>")
     execution_time = time.time() - initial_time
     return execution_time
Example #14
def cli_runner(*varargs, **kwargs):
    signal(SIGINT, ctrl_c_handler)  # Handle Ctrl + C

    settings = Settings.instance()
    settings.set(kwargs)

    if settings.debug:
        print("Command line inputs: " + str(varargs))

    counter = None
    if settings.ui:
        counter = Counter('Discovering Files: ')

    def counter_func():
        if counter is not None:
            counter.next()

    files_to_process = []
    if len(varargs) > 0:
        for arg in varargs:
            files_to_process += find_files(arg, counter_func)
    else:
        if settings.debug:
            print("No inputs provided, Scanning local directory")
        files_to_process = find_files(PWD, counter_func)

    if counter:
        counter.finish()

    problematic_certs = process_certs(files_to_process, settings)

    if settings.save_results:
        save_file(problematic_certs)

    for problem in problematic_certs:
        print(problem)

    if settings.send_to_slack and len(problematic_certs) > 0:
        send_to_slack(problematic_certs)
Example #15
def main():
    args = get_arguments()
    if args.verbosity >= 1:
        print("\n--- Generating barcodes ---")
    barcodes = []
    for i in range(args.numOfBc):
        barcode = generate_barcode(args.length, "")
        if barcodes:
            if args.verbosity >= 2:
                c = Counter("    Barcode {} candidate no: ".format(i + 1))
            while (not 0.4 <= gc_content(barcode) <= 0.6) or min(
                    distance(barcode, previous_bc)
                    for previous_bc in barcodes) <= args.distance:
                barcode = generate_barcode(args.length, "")
                if args.verbosity >= 2:
                    c.next()
            if args.verbosity >= 2:
                c.finish()
                print("")
        barcodes.append(barcode)
        if args.verbosity >= 1:
            print('Barcode {}: {}'.format(i+1, barcode))
            print('GC-content: {}'.format(gc_content(barcode)))
    write_2_file(barcodes, args.output, args.verbosity)
    return
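Example #15 calls gc_content and distance without showing them; a minimal sketch of what they might look like, assuming distance is a Hamming distance over equal-length barcodes (both helpers are hypothetical reconstructions, not part of the original):

def gc_content(seq):
    """Fraction of G and C bases in a DNA sequence (hypothetical helper)."""
    seq = seq.upper()
    return (seq.count("G") + seq.count("C")) / len(seq)

def distance(a, b):
    """Hamming distance between two equal-length barcodes (hypothetical helper)."""
    return sum(1 for x, y in zip(a, b) if x != y)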
Example #16
    def get_messages(self):
        # Get all messages of user
        # Output format:
        # [{'id': '13c...7', 'threadId': '13c...7'}, ...]

        # if os.path.exists("messages.pickle"):
        #     with open("messages.pickle", "rb") as token:
        #         messages = pickle.load(token)
        #         return messages

        # includeSpamTrash
        # labelIds

        response = self.service.users().messages().list(
            userId=self.user_id).execute()
        messages = []
        est_max = response["resultSizeEstimate"] * 5

        progress = Counter(
            f"{helpers.loader_icn} Fetching messages page ".ljust(
                _progressPadding, " "))

        if "messages" in response:
            messages.extend(response["messages"])

        while "nextPageToken" in response:
            page_token = response["nextPageToken"]

            response = (self.service.users().messages().list(
                userId=self.user_id, pageToken=page_token).execute())
            messages.extend(response["messages"])

            progress.next()

        progress.finish()

        return messages
Example #17
async def export_files(ctx: ExporterContext):
    files_generator = utils.AsyncIteratorWithRetry(
        ctx.slack_client.files_list,
        count=constants.ITEM_COUNT_LIMIT,
        ts_to=ctx.export_time  #, ts_from=ctx.last_export_time
    )
    all_files = []

    counter = Counter("Exporting files ")

    try:
        await files_generator.run()

        async for file_resp in files_generator:
            all_files.extend(file_resp["files"])
            for sfile in file_resp["files"]:
                file_obj = models.SlackFile(sfile)
                export_file(ctx, file_obj)
                counter.next()

            try:
                await ctx.downloader.flush_download_queue()
            except utils.AggregateError as e:
                log.warning(
                    f"Caught {len(e.errors)} errors while downloading files.")

                for err in e.errors:
                    log.warning(str(err))
    except SlackApiError as e:
        log.error(f"Got an API error while trying to obtain file info",
                  exc_info=e)

    ctx.downloader.write_json(
        os.path.join(constants.FILES_EXPORT_DIR, constants.FILES_JSON_FILE),
        all_files)
    counter.finish()
Example #18
def main():
    # open the books.csv file
    inloop = True
    books_csvfile_path = "../books.csv"
    books_csvfile = None
    while inloop:
        try:
            if books_csvfile_path.endswith("books.csv"):
                books_csvfile = open(os.path.realpath(books_csvfile_path))
                inloop = False
            else:
                raise FileNotFoundError()
        except FileNotFoundError as error:
            print("We couldn't find the 'books.csv' file.\n")
            books_csvfile_path = input(
                "Please input the 'books.csv' absolute file path: ")
            inloop = True

    books = csv.reader(books_csvfile)

    # iterate through all books
    row_count = 0
    print('    This process may take long')
    progress = Counter('    - Importing books: ')
    for isbn, title, author, year in books:
        # skip the first line of the file because it contains the CSV headers
        if not (row_count == 0):
            book = Book(isbn, title, author, year)
            book.insertToTable()
            progress.next()
        row_count += 1

    db.commit()
    progress.finish()
    db.remove()
    print("    Books imported succesfully!")
Example #19
 def add_progress_bar(self):
     self.progress_indicator = Counter(self.name)
Example #20
    def scrape(self, filename):
        """
        Scrapes metadata of S2ORC articles from given file

        :param filename: name of file in data folder to scrape from
        """
        print(
            f'Collection: {self._collection.database.name}.{self._collection.name}. Database: S2ORC. File: {filename}'
        )

        abstracts = []
        articles = []
        no_id = 0
        unreadable = 0

        # counter
        counter = Counter(message='Articles analyzed: ')

        file = open(os.path.join(DATA_PATH, filename), 'r')

        # load GB to US dictionary
        with open('miscellaneous/us_gb_dict.txt', 'r') as convert:
            spelling = json.load(convert)
        print('Stored json dictionary in memory')

        for data in file:
            article = json.loads(data)

            # ignore abstract if article is not from PubMed or PubMedCentral
            uid = article.get('pubmed_id')
            pmc = article.get('pmc_id')
            doi = article.get('doi')
            paperid = article.get('paper_id')
            if not uid and not pmc and not doi and not paperid:
                no_id += 1
                counter.next()
                continue

            # store abstract text for use by mat2vec below
            abstract = article.get('abstract')

            # continues if paper does not have abstract
            if not abstract:
                unreadable += 1
                counter.next()
                continue

            # replaces ':::' with newline
            abstract = abstract.replace('::: ', '\n')

            # segments abstract by sentence
            doc = self.nlp(abstract)
            sentences = []
            is_unreadable = False

            # processes sentence text using mat2vec processor
            for sent in doc.sents:
                try:
                    tokens, materials = self.processor.process(sent.text)
                except OverflowError:
                    is_unreadable = True
                    break

                processed_sent = ' '.join(
                    [token.lemma_ for token in sent if not token.is_stop])
                for gb, us in spelling.items():
                    processed_sent = processed_sent.replace(gb, us)
                sentences.append(processed_sent)

            # if processor (from above) throws an error, skip the paper
            if is_unreadable:
                unreadable += 1
                counter.next()
                continue

            processed_abstract = '\n'.join(sentences)

            # create new document and store new article document if not in collection
            article = {
                'doi': doi,
                'uid': uid,
                'pmc': pmc,
                'paperid': paperid,
                'title': article.get('title'),
                'abstract': abstract,
                'url': article.get('s2_url'),
                'creators': self._get_creators(article.get('authors')),
                'publication_name': article.get('journal'),
                'year': article.get('year'),
                'database': 's2orc',
                'processed_abstract': processed_abstract
            }
            articles.append(article)
            abstracts.append(processed_abstract)
            counter.next()

            # classify abstracts if 20000 have been stored
            if len(abstracts) == 20000:
                self._store(articles, abstracts)
                articles = []
                abstracts = []
        counter.finish()

        # unreadable papers
        print(f'No ID: {no_id}')
        print(f'Unreadable papers: {unreadable}')

        # classifies and stores metadata
        if abstracts:
            self._store(articles, abstracts)
            print()
        else:
            print('No abstracts to classify.\n')
            return

        # prints classifier metrics
        for classifier in self._classifiers:
            classifier.print_metrics()
            classifier.reset_metrics()

        # prints general tag metrics
        if self._save:
            print(f'Total articles analyzed: {self._gen_total}.')
            print(
                f'Stored {self._gen_new} new abstracts to \'{self._gen_tag}\'.'
            )
            print()
            self._gen_new = 0
            self._gen_total = 0
Example #21
def init(ip, port):
  soc = socket(AF_INET, SOCK_STREAM)
  soc.settimeout(4)
  soc.connect((ip, int(port)))
  soc.send('GET /?{} HTTP/1.1\r\n'.format(randint(0,2000)).encode('utf-8'))
  for header in headers: soc.send('{}\r\n'.format(header).encode('utf-8'))
  return soc

if __name__ == '__main__':
  if len(argv)<5:
    exit(REDC+"Usage: {} ip port count time".format(argv[0]))
  socketList = []
  logger.info('count: {} timer: {}'.format(count, timer))

  bar = Counter(GREENC+'Creating sockets: '+YELLOWC, max=count)
  for _ in range(count):
    try: soc=init(ip, port)
    except error: break
    socketList.append(soc)
    bar.next()

  print()
  while True:
    sendbar = PixelBar(GREYC+'Sending keep-alive Headers'+REDC, max=timer)
    logger.info('Sending keep-alive Headers')
    
    # iterate over a copy so removing dead sockets doesn't skip entries
    for soc in list(socketList):
      try: soc.send('X-a {}\r\n'.format(randint(1,6000)).encode('utf-8'))
      except error: socketList.remove(soc)
Example #22
def counter_progress_cli(msg, max=0):
    return Counter(msg + ' - ')
Example #23
    urlHtml = requests.get(url).content
    # soup object of html
    soup = bs(urlHtml, 'lxml')
    # goes through each tag with an href in the page
    for tag in soup.find_all(href=True):
        # href attribute of the tag
        href = tag['href']
        # keep site-relative URIs (not protocol-relative '//...'), longer than '/', and not already collected
        if (href.startswith('/') and not href.startswith('//')
                and len(href) > 1 and href not in hrefs):
            # adds href to list
            hrefs.append(href)


count = Counter("Scraping : ")
# list for hrefs
hrefs = []
# input for base url,
base = input()
# checks if base ends with '/'
if (base.endswith('/')):
    # postion of last character before '/'
    lastPos = len(base) - 1
    # removes '/' from base
    base = base[0:lastPos]
# gets all the hrefs from base URL
getHrefs(base, "", hrefs)
# loops through hrefs
# hrefs will be added as it loops through each href
# but will end once all hrefs have been checked
Example #24
async def export_conversation_history(ctx: ExporterContext,
                                      convo: models.SlackConversation):
    def file_filter(raw_file: Dict[str, Any]) -> bool:
        if "mode" in raw_file and raw_file["mode"] == "tombstone":
            return False

        filename = os.path.join(constants.FILES_EXPORT_DIR, raw_file["id"])

        return not ctx.downloader.exists(filename)

    history_generator = utils.AsyncIteratorWithRetry(
        ctx.slack_client.conversations_history,
        channel=convo.id,
        limit=constants.ITEM_COUNT_LIMIT,
        latest=ctx.export_time,
        oldest=ctx.last_export_time)

    history_folder = os.path.join(ctx.output_directory,
                                  constants.CONVERSATIONS_EXPORT_DIR, convo.id,
                                  constants.HISTORY_JSON_DIR)

    history_fragment = ctx.fragments.create(history_folder)

    temporary_dir = tempfile.TemporaryDirectory()
    temp_fragment = ctx.fragments.create(temporary_dir.name)

    counter = Counter(f"Exporting conversation history ({convo.name}) ")

    try:
        await history_generator.run()

        async for history_resp in history_generator:
            for msg in history_resp["messages"]:
                msg_obj = models.SlackMessage(msg)

                try:
                    if msg_obj.has_files:
                        files = await msg_obj.get_files(ctx, file_filter)

                        for f in files:
                            export_file(ctx, f)
                except SlackApiError as e:
                    log.error(
                        f"Error while obtaining file metadata for message {msg_obj.ts} in channel {convo.id}",
                        exc_info=e)

                try:
                    if msg_obj.has_replies:
                        await msg_obj.populate_replies(ctx, convo)
                except SlackApiError as e:
                    log.error(
                        f"Error while obtaining reply metadata for message {msg_obj.ts} in channel {convo.id}",
                        exc_info=e)

                temp_fragment.append(msg_obj.data)
                counter.next()

            try:
                await ctx.downloader.flush_download_queue()
            except utils.AggregateError as e:
                log.warning(
                    f"Caught {len(e.errors)} errors while downloading files.")

                for err in e.errors:
                    log.warning(str(err))

            temp_fragment.commit_fragments()
    except SlackApiError as e:
        log.error(
            f"Got an API error while trying to obtain conversation history",
            exc_info=e)
    except Exception as e:
        log.error(
            f"Uncaught {e.__class__.__name__}; you may need to do a full resync",
            exc_info=e)

    history_fragment.extend(
        temp_fragment[::-1])  # Slack messages are stored in descending order

    temp_fragment.close()
    history_fragment.close()
    temporary_dir.cleanup()

    counter.finish()
Example #25
def read_tiles(src, min_zoom=0, max_zoom=None, tile_size=256):
    """This function is a generator that reads all tiles 
    that overlap with the extent of src between min_zoom and max_zoom.
    
    Parameters
    ----------
    src : rasterio.DatasetReader
        Input dataset, opened for reading
    min_zoom : int, optional (default 0)
    max_zoom : int, optional (default None)
        If None, max_zoom will be calculated based on the extent of src
    tile_size : int, optional (default 256)
        length and width of tile
    
    Yields
    ------
    tile (mercantile.Tile), tile data (of shape (tile_size, tile_size)), and tile transform
    """
    def _read_tile(vrt, tile, tile_size=256):
        """Read a tile of data from the VRT.

        If the tile bounds fall outside the vrt bounds, we have to calculate
        offsets and widths ourselves (because WarpedVRT does not allow boundless reads)
        and paste the data that were read into an otherwise blank tile (filled with Nodata value).
        
        Parameters
        ----------
        vrt : rasterio.WarpedVRT
            WarpedVRT initialized from the data source.  Example:
                with WarpedVRT(
                    src,
                    crs="EPSG:3857",
                    nodata=src.nodata,
                    resampling=Resampling.nearest,
                    width=tile_size,
                    height=tile_size,
                ) as vrt
        tile : mercantile.Tile
            Tile object describing z, x, y coordinates
        tile_size : int, optional (default 256)
            length and width of tile   

        Returns
        -------
        tuple of numpy array of data with shape (tile_size, tile_size), tile transform object
        """

        tile_bounds = mercantile.xy_bounds(*tile)
        window = vrt.window(*tile_bounds)

        dst_transform = vrt.window_transform(window)
        scaling = Affine.scale(window.width / tile_size,
                               window.height / tile_size)
        dst_transform *= scaling

        x_res = abs(dst_transform.a)
        y_res = abs(dst_transform.e)

        left_offset = max(
            int(round((vrt.bounds[0] - tile_bounds[0]) / x_res, 0)), 0)
        right_offset = max(
            int(round((tile_bounds[2] - vrt.bounds[2]) / x_res, 0)), 0)

        bottom_offset = max(
            int(round((vrt.bounds[1] - tile_bounds[1]) / y_res, 0)), 0)
        top_offset = max(
            int(round((tile_bounds[3] - vrt.bounds[3]) / y_res, 0)), 0)

        width = tile_size - left_offset - right_offset
        height = tile_size - top_offset - bottom_offset

        if not (width > 0 and height > 0):
            # No data can be read within a window that has no width or height
            # so return a blank tile
            data = np.empty((1, tile_size, tile_size), dtype=vrt.dtypes[0])
            data.fill(vrt.nodata)

            return data[0], dst_transform

        data = vrt.read(out_shape=(1, height, width), window=window)

        if width != tile_size or height != tile_size:
            # Create a blank tile (filled with nodata) and paste in data
            out = np.empty((1, tile_size, tile_size), dtype=vrt.dtypes[0])
            out.fill(vrt.nodata)
            out[0, top_offset:top_offset + data.shape[1],
                left_offset:left_offset + data.shape[2], ] = data
            data = out

        return data[0], dst_transform

    with WarpedVRT(
            src,
            crs="EPSG:3857",
            nodata=src.nodata,
            resampling=Resampling.nearest,
            width=tile_size,
            height=tile_size,
    ) as vrt:

        if max_zoom is None:
            max_zoom = get_default_max_zoom(src)

        tiles = mercantile.tiles(*get_geo_bounds(src),
                                 range(min_zoom, max_zoom + 1))

        for tile in Counter("Extracting tiles...    ").iter(tiles):
            data, transform = _read_tile(vrt, tile, tile_size)
            yield tile, data, transform
Example #26
    temporary_directory_path = os.path.abspath(args.temporary_directory_path)
    checkpoint = args.checkpoint

    with open(corpus_path, "r") as corpus_file, tempfile.TemporaryDirectory(
            dir=temporary_directory_path) as tmp_file_dir, Pool(
                processes=cpu_count()) as pool:
        if args.split_by_lines:
            print("Splitting by number of lines")
            articles = extract_article_by_number_of_sentence(
                corpus_file, args.split_by_lines)
        else:
            print("Splitting by blank lines")
            articles = extract_article_by_blank_lines(corpus_file)
        results = []

        for i, article in Counter("Spawning threads...").iter(
                enumerate(articles)):
            if i < checkpoint:
                continue

            while psutil.virtual_memory().free * 0.9 < psutil.Process(
                    os.getpid()).memory_full_info().rss / (i + 1):
                print(f"""Waiting 5s for RAM to be freed. 
                Currently {psutil.virtual_memory().percent}% of RAM is used."""
                      )
                sleep(5)
                print([res.get() for res in results])

            f_args = (
                article,
                output_path,
                vocab_path,
Example #27
    return startupTime, traffic


startupTime = time()
traffic = 0.0

with open('./data.csv', 'r') as csvreader:
    allRows = list(csv.DictReader(csvreader))
    allRowsForWrite = list(allRows)

    i = 0
    ids = []
    urls = []
    scientific_names = []
    bar = Counter('Loading process: ')
    for row in allRows:
        atexit.register(ataxit_handler, allRowsForWrite=allRowsForWrite)

        i += 1
        if i > 20:
            imagesContent = get_images(urls)
            startupTime, traffic = set_images(ids, scientific_names,
                                              imagesContent, startupTime,
                                              traffic)
            del allRowsForWrite[:20]
            i, ids, urls, scientific_names = 1, [], [], []

        ids.append(row['id'])
        urls.append(row['image_url'])
        scientific_names.append(row['scientific_name'])