def updateDataBase(token):
    repository = Repository(token)
    print("Get Java Repositories")
    repositoriesJava = repository.get_repositories("java", 100)
    print("Get Python Repositories")
    repositoriesPython = repository.get_repositories("python", 100)
    repositories = repositoriesJava + repositoriesPython
    RepositoryData().update(repositories)
def insert_repo_and_user(gituser, repolist):
    Session = sessionmaker(bind=engine)
    session = Session()
    raw_repos = session.query(Repository).filter(
        Repository.username == gituser).all()
    aquired_repos = []
    print('Repositories already stored for this user:')
    for repo in raw_repos:
        print(f'{repo.username}:{repo.reponame}')
        aquired_repos.append(repo.reponame)
    print(f'\nUser repositories fetched from git: {repolist}\n')
    delete_list = select_delete_candidates(aquired_repos, repolist)
    insert_list = select_insert_candidates(aquired_repos, repolist)
    if delete_list is not None:
        for delete_this in delete_list:
            print(f'User no longer has {delete_this} in their account, deleting...\n')
            session.query(Repository).filter(
                Repository.reponame == delete_this).delete()
    if insert_list is not None:
        for insert_this in insert_list:
            print(f'New repository named {insert_this} found, adding...\n')
            newadd = Repository(gituser, insert_this)
            session.add(newadd)
    session.commit()
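# The helpers select_delete_candidates and select_insert_candidates are not shown
# in this snippet. A minimal sketch, assuming they simply diff the locally stored
# repository names against the names fetched from GitHub, could look like this:
def select_delete_candidates(stored_repos, fetched_repos):
    # Assumed behaviour: names stored locally but no longer present on GitHub.
    return [name for name in stored_repos if name not in fetched_repos]


def select_insert_candidates(stored_repos, fetched_repos):
    # Assumed behaviour: names present on GitHub but not yet stored locally.
    return [name for name in fetched_repos if name not in stored_repos]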
def show_all_forums():
    print("Forums at DB: " + database_config.HOST)
    print("-------------------------------------")
    print("forum_id | forum_link")
    repository = Repository.Repository()
    forums = repository.get_all_forums()
    for f in forums:
        print(str(f.forum_id) + " | " + f.link)
def get_texts_and_prepare_data_frame(date_from, date_to, forum_id):
    """Query the database for posts and turn the result into a DataFrame."""
    repository = Repository.Repository()
    data = repository.get_posts(date_from, date_to, forum_id)
    data_frame = pd.DataFrame(
        data, columns=['post', 'post_date', 'topic_title', 'category'])
    return data_frame
def getRepo(ctx, repository):
    if repository not in ctx.obj['repositories']:
        print('[ERROR] The repository does not exist, %s' % repository,
              file=sys.stderr)
        sys.exit(1)
    try:
        mirror_path = os.path.realpath(
            os.path.join(ctx.obj['current_path'],
                         ctx.obj['repositories'][repository]['mirror-path']))
    except KeyError as err:
        print('[ERROR] Incorrect repositories configuration file, repository: %s [%s]'
              % (repository, err), file=sys.stderr)
        sys.exit(1)
    repo = Repository(repository, mirror_path,
                      **ctx.obj['repositories'][repository])
    return repo
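# For context, getRepo expects ctx.obj['repositories'] to map repository names to
# their settings, each containing at least a 'mirror-path' key; the remaining keys
# are forwarded to Repository(**config). A hypothetical shape for that object:
example_ctx_obj = {                              # hypothetical name, for illustration
    'current_path': '/srv/mirrors',              # hypothetical base directory
    'repositories': {
        'example-repo': {                        # hypothetical repository name
            'mirror-path': 'mirrors/example-repo',
            # additional, repository-specific options would go here
        },
    },
}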
def __init__(self, repo_client=Repository(adapter=UserRepository)):
    self.repo_client = repo_client
languages = ", ".join(languages) await repository.get_or_create( external_id=repo["id"], name=repo["full_name"], languages=languages, owner_id=owner.id, ) # Waiting a bit to avoid too much requests print("Waiting 1 second before next request") await asyncio.sleep(1) await scrap_github(repository, user) async def fetch_data(url, session): async with session.get(url) as response: if response.status == 403 or response.status == 401: raise ValueError("API limit exceeded") return await response.json() if __name__ == "__main__": engine = create_engine(config.DB_URL) session = sessionmaker(bind=engine)() repository = Repository(session) user = User(session) asyncio.run(scrap_github(repository, user))
def __init__(self, repo_client=Repository(adapter=ReportRepository)):
    self.repo_client = repo_client
    self.platforms = ['osx', 'linux', 'win', 'win7']
def export_posts(forum_id, date_from, date_to, filename):
    repository = Repository.Repository()
    data = repository.get_posts(date_from, date_to, forum_id)
    df = pd.DataFrame(data,
                      columns=['post', 'post_date', 'topic_title', 'category'])
    df.to_csv(filename, sep=';', escapechar='\\', encoding='utf8')
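# A minimal usage sketch for export_posts; the forum id, date strings and filename
# below are illustrative, and the date format is assumed to match whatever
# Repository.get_posts expects.
export_posts(forum_id=1,
             date_from='2020-01-01',
             date_to='2020-12-31',
             filename='posts_2020.csv')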
def __init__(self):
    self.repository = r.Repository()
    self.forum = None
def __init__(self):
    self.repository = r.Repository()
    self.strategy_initialized = False
    self.forum = None
def finish_strategy(self):
    repository = r.Repository()
    all_categories = repository.get_all_categories(self.forum)
    categories_data_frame = pd.DataFrame.from_records(
        [category.to_dict() for category in all_categories])
    categories_data_frame.to_csv("config/categories.csv", sep=";")
def __init__(self, repo_client=Repository(adapter=SessionRepository)):
    self.repo_client = repo_client
def repository(dbsession):
    return Repository(dbsession)
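# The dbsession dependency used above is not shown. A minimal sketch, assuming this
# is a pytest fixture and the project uses SQLAlchemy; the in-memory SQLite URL is
# an assumption for illustration only.
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


@pytest.fixture
def dbsession():
    # Hypothetical test database; the real project may configure this differently.
    engine = create_engine("sqlite:///:memory:")
    session = sessionmaker(bind=engine)()
    yield session
    session.close()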
class CategoriesSpider(scrapy.Spider):
    name = 'categories'
    repository = Repository.Repository()

    def __init__(self, start_url, scrap_mode, **kwargs):
        super().__init__(**kwargs)
        self.start_urls = [start_url]
        self.base_domain = start_url
        self.scrap_mode = scrap_mode
        self.logger_dbg = logging_util.get_logger("logs/debug")
        self.rule_provider = rp.RuleProvider()
        self.rule_provider.prepare_model()
        self.scraping_strategy = scraping_strategy_builder.get_strategy(scrap_mode)
        self.forum = self.scraping_strategy.init_strategy(self.base_domain)

    def parse(self, response):
        soup = BeautifulSoup(response.body, features="lxml")
        parent = response.meta.get('parent')
        for tag in self.rule_provider.possible_tags:
            elements_with_tag = soup.body.findAll(tag)
            for html_element in elements_with_tag:
                if html_util.element_has_css_class(html_element):
                    predicted = self.rule_provider.predict(tag, html_element["class"])
                    yield from self.scraping_strategy.execute_strategy(
                        html_element, parent, predicted, tag, self.rule_provider, self)

    def parse_categories(self, html_element, predicted, parent):
        """Parse the categories found in the given HTML element."""
        category = None
        # Title found directly
        if predicted == self.rule_provider.get_mapping(m.category_title):
            link = html_element['href']
            title = str(html_element.contents[0])
            category = self.repository.save_category(title, link, parent, self.forum)
            self.logger_dbg.info(title + " " + self.base_domain + link)
        # Unwrapping needed: the title sits inside a wrapping element
        if predicted == self.rule_provider.get_mapping(m.category_whole):
            try:
                first_a_html_element_inside_whole = html_element.findAll("a")[0]
                link = first_a_html_element_inside_whole['href']
                title = str(first_a_html_element_inside_whole.contents[0])
                category = self.repository.save_category(title, link, parent, self.forum)
                self.logger_dbg.info(title + " " + self.base_domain + link)
            except BaseException as e:
                self.logger_dbg.error(str(e))
                self.logger_dbg.error("Can't find category inside: " + str(html_element))
        if category is not None and html_util.url_not_from_other_domain(
                category.link, self.base_domain):
            yield scrapy.Request(url=build_link(self.base_domain, category.link),
                                 callback=self.parse,
                                 meta={'parent': category})

    def parse_topics(self, html_element, parent):
        author = None
        date = None
        link = None
        title = None
        for tag in self.rule_provider.possible_tags_topics:
            elements_inside_tag = html_element.findAll(tag)
            for elem in elements_inside_tag:
                if html_util.element_has_css_class(elem):
                    predicted = self.rule_provider.predict(tag, elem["class"])
                    if predicted == self.rule_provider.get_mapping(m.topic_title):
                        title = elem.contents[0]
                        link = elem['href']
                        self.logger_dbg.info(title + " " + link)
                    if predicted == self.rule_provider.get_mapping(m.author):
                        author = elem.contents[0]
                    if predicted == self.rule_provider.get_mapping(m.topic_date):
                        date = dpt.parse_date(elem.contents)
        # Additional check for English-language Invision forums
        time_tags = html_element.findAll("time")
        if len(time_tags) > 0:
            date = dpt.parse_english_date(time_tags[0].contents)
            link = html_element.findAll('a')[0]['href']
            title = html_element.findAll('a')[0]['title']
        if title is None or link is None:
            self.logger_dbg.info("Can't find topic inside: " + str(html_element))
            return
        if not filtering.topic_meets_criterion(title, author, date):
            return
        topic = self.repository.save_topic(author, date, link, parent, title)
        self.logger_dbg.info("Scraped topic: " + str(topic))
        yield scrapy.Request(dont_filter=True,
                             url=build_link(self.base_domain, topic.link),
                             callback=self.parse,
                             meta={'parent': topic})
    def parse_posts(self, html_element, parent):
        """Parse the posts found in the given HTML element."""
        self.logger_dbg.info("Parsing post of topic: " + parent.title)
        author = None
        date = None
        content = None
        # Go through all the tags that can occur in posts and determine their function
        for tag in self.rule_provider.possible_tags_posts:
            elements_with_tag = html_element.findAll(tag)
            for elem in elements_with_tag:
                if html_util.element_has_css_class(elem):
                    predicted = self.rule_provider.predict(tag, elem["class"])
                    if predicted == self.rule_provider.get_mapping(m.post_body):
                        content = filtering.assign_new_value_if_changed_and_not_null(
                            content, ppt.contents_to_plain_text(elem.contents))
                    if predicted == self.rule_provider.get_mapping(m.author):
                        author = elem.contents[0]
                    if predicted == self.rule_provider.get_mapping(m.post_date):
                        date = dpt.parse_date(elem.contents)
        # Additional check for the English date format
        time_tags = html_element.findAll("time")
        if len(time_tags) > 0 and date is None:
            date = dpt.parse_english_date(time_tags[0].contents)
        if content is not None and filtering.post_meets_criterions(content, author, date):
            self.repository.save_post(author, content, date, parent)

    def go_to_next_page(self, html_element, parent, predicted):
        """Follow the link to the next page, if one is recognised."""
        if predicted == self.rule_provider.get_mapping(m.next_page):
            try:
                first_a_html_element_inside_whole = html_element.findAll("a")[0]
                link = first_a_html_element_inside_whole['href']
                self.logger_dbg.info("Going to next page: " + str(parent) +
                                     " unwrapped url: " + link)
                yield scrapy.Request(url=build_link(self.base_domain, link),
                                     callback=self.parse,
                                     meta={'parent': parent})
            except BaseException as e:
                self.logger_dbg.error("Couldn't go to next page of: " + str(parent) +
                                      " due to: " + str(e))
                self.logger_dbg.error("Element that caused the problem: " +
                                      str(html_element))
        elif predicted == self.rule_provider.get_mapping(m.next_page_link):
            self.logger_dbg.info("Going to next page: " + str(parent) +
                                 " url: " + html_element['href'])
            yield scrapy.Request(url=build_link(self.base_domain, html_element['href']),
                                 callback=self.parse,
                                 meta={'parent': parent})

    def closed(self, reason):
        self.scraping_strategy.finish_strategy()
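# A minimal launch sketch (not shown in the source) using Scrapy's CrawlerProcess;
# the start URL and scrap_mode value below are hypothetical.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(CategoriesSpider,
              start_url="https://forum.example.com",  # hypothetical forum URL
              scrap_mode="full")                      # hypothetical scrap mode
process.start()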