Example #1
def updateDataBase(token):
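    """Fetch Java and Python repositories and update the database with the combined list."""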
    repository = Repository(token)
    print("Get Java Repositories")
    repositoriesJava = repository.get_repositories("java", 100)
    print("Get Python Repositories")
    repositoriesPython = repository.get_repositories("python", 100)
    repositories = repositoriesJava + repositoriesPython
    RepositoryData().update(repositories)
Example #2
def insert_repo_and_user(gituser, repolist):
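    """Synchronize the stored repositories of gituser with repolist: remove repositories no longer on the account and add newly found ones."""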
    Session = sessionmaker(bind=engine)
    session = Session()
    raw_repos = session.query(Repository).filter(
        Repository.username == gituser).all()
    aquired_repos = []
    print('Repositories already stored for this user:')
    for repo in raw_repos:
        print(f'{repo.username}:{repo.reponame}')
        aquired_repos.append(repo.reponame)

    print(f'\nUser repositories fetched from git: {repolist}\n')
    delete_list = select_delete_candidates(aquired_repos, repolist)
    insert_list = select_insert_candidates(aquired_repos, repolist)
    #print(delete_list)
    #print(insert_list)
    if delete_list is not None:
        for delete_this in delete_list:
            print(
                f'User no longer has {delete_this} in their account, deleting...\n'
            )
            session.query(Repository).filter(
                Repository.reponame == delete_this).delete()
    if insert_list is not None:
        for insert_this in insert_list:
            print(
                f'New repository named {insert_this} found, adding...\n'
            )
            newadd = Repository(gituser, insert_this)
            session.add(newadd)
    session.commit()
Example #3
def show_all_forums():
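    """Print the id and link of every forum stored in the database."""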
    print("Forums at DB:" + database_config.HOST)
    print("-------------------------------------")
    print("forum_id | forum_link")
    repository = Repository.Repository()
    forums = repository.get_all_forums()
    for f in forums:
        print(str(f.forum_id) + " | " + f.link)
Example #4
def get_texts_and_prepare_data_frame(date_from, date_to, forum_id):
    """
    Performs an SQL query against the database and turns the result into a DataFrame.
    """
    repository = Repository.Repository()
    data = repository.get_posts(date_from, date_to, forum_id)
    data_frame = pd.DataFrame(
        data, columns=['post', 'post_date', 'topic_title', 'category'])
    return data_frame
Example #5
def getRepo(ctx, repository):
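    """Return a Repository for the given name, resolving its mirror-path from the configuration stored on ctx; exit with an error if it is unknown or misconfigured."""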
    if repository not in ctx.obj['repositories']:
        print('[ERROR] The repository does not exist, %s' % repository,
              file=sys.stderr)
        sys.exit(1)
    try:
        mirror_path = os.path.realpath(
            os.path.join(ctx.obj['current_path'],
                         ctx.obj['repositories'][repository]['mirror-path']))
    except KeyError as err:
        print('[ERROR] Incorrect repositories configuration file, repository: %s [%s]'
              % (repository, err), file=sys.stderr)
        sys.exit(1)
    repo = Repository(repository, mirror_path,
                      **ctx.obj['repositories'][repository])
    return repo
Example #6
 def __init__(self, repo_client=Repository(adapter=UserRepository)):
     self.repo_client = repo_client
            languages = ", ".join(languages)

            await repository.get_or_create(
                external_id=repo["id"],
                name=repo["full_name"],
                languages=languages,
                owner_id=owner.id,
            )

    # Wait a bit to avoid making too many requests
    print("Waiting 1 second before next request")
    await asyncio.sleep(1)
    await scrap_github(repository, user)


async def fetch_data(url, session):
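    """Fetch url and return the decoded JSON body; raise ValueError when the API responds with 401 or 403 (limit exceeded)."""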
    async with session.get(url) as response:
        if response.status == 403 or response.status == 401:
            raise ValueError("API limit exceeded")
        return await response.json()


if __name__ == "__main__":
    engine = create_engine(config.DB_URL)
    session = sessionmaker(bind=engine)()

    repository = Repository(session)
    user = User(session)

    asyncio.run(scrap_github(repository, user))
Example #8
 def __init__(self, repo_client=Repository(adapter=ReportRepository)):
     self.repo_client = repo_client
     self.platforms = ['osx', 'linux', 'win', 'win7']
Example #9
def export_posts(forum_id, date_from, date_to, filename):
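    """Export the posts of forum_id between date_from and date_to to a semicolon-separated CSV file."""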
    repository = Repository.Repository()
    data = repository.get_posts(date_from, date_to, forum_id)
    df = pd.DataFrame(data,
                      columns=['post', 'post_date', 'topic_title', 'category'])
    df.to_csv(filename, sep=';', escapechar='\\', encoding='utf8')
Example #10
 def __init__(self):
     self.repository = r.Repository()
     self.forum = None
Example #11
 def __init__(self):
     self.repository = r.Repository()
     self.strategy_initialized = False
     self.forum = None
 def finish_strategy(self):
     repository = r.Repository()
     all_categories = repository.get_all_categories(self.forum)
     categories_data_frame = pd.DataFrame.from_records(
         [category.to_dict() for category in all_categories])
     categories_data_frame.to_csv("config/categories.csv", sep=";")
Example #13
 def __init__(self, repo_client=Repository(adapter=SessionRepository)):
     self.repo_client = repo_client
Example #14
def repository(dbsession):
    return Repository(dbsession)
Example #15
class CategoriesSpider(scrapy.Spider):
    name = 'categories'
    repository = Repository.Repository()

    def __init__(self, start_url, scrap_mode, **kwargs):
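        """Set up the start URL, scrap mode, debug logger, rule provider and scraping strategy for this spider."""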
        super().__init__(**kwargs)
        self.start_urls = [start_url]
        self.base_domain = start_url
        self.scrap_mode = scrap_mode
        self.logger_dbg = logging_util.get_logger("logs/debug")
        self.rule_provider = rp.RuleProvider()
        self.rule_provider.prepare_model()
        self.scraping_strategy = scraping_strategy_builder.get_strategy(
            scrap_mode)
        self.forum = self.scraping_strategy.init_strategy(self.base_domain)

    def parse(self, response):
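        """Classify CSS-classed elements on the page with the rule provider and delegate handling to the scraping strategy."""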
        soup = BeautifulSoup(response.body, features="lxml")
        parent = response.meta.get('parent')
        for tag in self.rule_provider.possible_tags:
            elements_with_tag = soup.body.findAll(tag)
            for html_element in elements_with_tag:
                if html_util.element_has_css_class(html_element):
                    predicted = self.rule_provider.predict(
                        tag, html_element["class"])
                    yield from self.scraping_strategy.execute_strategy(
                        html_element, parent, predicted, tag,
                        self.rule_provider, self)

    def parse_categories(self, html_element, predicted, parent):
        """
        Executes action of parsing the categories
        """
        category = None
        """ Title found"""
        if predicted == self.rule_provider.get_mapping(m.category_title):
            link = html_element['href']
            title = str(html_element.contents[0])
            category = self.repository.save_category(title, link, parent,
                                                     self.forum)
            self.logger_dbg.info(title + " " + self.base_domain + link)
        """ Unwrapping needed """
        if predicted == self.rule_provider.get_mapping(m.category_whole):
            try:
                first_a_html_element_inside_whole = html_element.findAll(
                    "a")[0]
                link = first_a_html_element_inside_whole['href']
                title = str(first_a_html_element_inside_whole.contents[0])
                category = self.repository.save_category(
                    title, link, parent, self.forum)
                self.logger_dbg.info(title + " " + self.base_domain + link)
            except BaseException as e:
                self.logger_dbg.error(str(e))
                self.logger_dbg.error("Can't find category inside: " +
                                      str(html_element))

        if category is not None and html_util.url_not_from_other_domain(
                category.link, self.base_domain):
            yield scrapy.Request(url=build_link(self.base_domain,
                                                category.link),
                                 callback=self.parse,
                                 meta={'parent': category})

    def parse_topics(self, html_element, parent):
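        """Extract the title, link, author and date of a topic, save it and follow its link to parse the topic page."""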
        author = None
        date = None
        link = None
        title = None
        for tag in self.rule_provider.possible_tags_topics:
            elements_inside_tag = html_element.findAll(tag)
            for elem in elements_inside_tag:
                if html_util.element_has_css_class(elem):
                    predicted = self.rule_provider.predict(tag, elem["class"])
                    if predicted == self.rule_provider.get_mapping(
                            m.topic_title):
                        title = elem.contents[0]
                        link = elem['href']
                        self.logger_dbg.info(title + " " + link)
                    if predicted == self.rule_provider.get_mapping(m.author):
                        author = elem.contents[0]
                    if predicted == self.rule_provider.get_mapping(
                            m.topic_date):
                        date = dpt.parse_date(elem.contents)
        """ Additional check english speaking invision """
        time_tags = html_element.findAll("time")
        if len(time_tags) > 0:
            date = dpt.parse_english_date(time_tags[0].contents)
            link = html_element.findAll('a')[0]['href']
            title = html_element.findAll('a')[0]['title']

        if title is None or link is None:
            self.logger_dbg.info("Can't find topic inside: " +
                                 str(html_element))
            return

        if not filtering.topic_meets_criterion(title, author, date):
            return
        topic = self.repository.save_topic(author, date, link, parent, title)
        self.logger_dbg.info("Scrapped topic: " + str(topic))
        yield scrapy.Request(dont_filter=True,
                             url=build_link(self.base_domain, topic.link),
                             callback=self.parse,
                             meta={'parent': topic})

    def parse_posts(self, html_element, parent):
        """
        Executes action of parsing the post
        """
        self.logger_dbg.info("Parsing post of topic: " + parent.title)
        author = None
        date = None
        content = None
        """Go through all the possible tags that occur in posts and determine their function """
        for tag in self.rule_provider.possible_tags_posts:
            elements_with_tag = html_element.findAll(tag)
            for elem in elements_with_tag:
                if html_util.element_has_css_class(elem):
                    predicted = self.rule_provider.predict(tag, elem["class"])
                    if predicted == self.rule_provider.get_mapping(
                            m.post_body):
                        content = filtering.assign_new_value_if_changed_and_not_null(
                            content, ppt.contents_to_plain_text(elem.contents))
                    if predicted == self.rule_provider.get_mapping(m.author):
                        author = elem.contents[0]
                    if predicted == self.rule_provider.get_mapping(
                            m.post_date):
                        date = dpt.parse_date(elem.contents)
        """Perform additional check for english format of date """
        time_tags = html_element.findAll("time")
        if len(time_tags) > 0 and date is None:
            date = dpt.parse_english_date(time_tags[0].contents)
        if content is not None and filtering.post_meets_criterions(
                content, author, date):
            self.repository.save_post(author, content, date, parent)

    def go_to_next_page(self, html_element, parent, predicted):
        """
        Executes action of going to the next page
        """
        if predicted == self.rule_provider.get_mapping(m.next_page):
            try:
                first_a_html_element_inside_whole = html_element.findAll(
                    "a")[0]
                link = first_a_html_element_inside_whole['href']
                self.logger_dbg.info("Going to next page: " + str(parent) +
                                     " unwrapped url: " + link)
                yield scrapy.Request(url=build_link(self.base_domain, link),
                                     callback=self.parse,
                                     meta={'parent': parent})
            except BaseException as e:
                self.logger_dbg.error("Couldn't go to next page of: " +
                                      str(parent) + " due to: " + str(e))
                self.logger_dbg.error("Element that caused the problem: " +
                                      str(html_element))
        elif predicted == self.rule_provider.get_mapping(m.next_page_link):
            self.logger_dbg.info("Going to next page: " + str(parent) +
                                 " url: " + html_element['href'])
            yield scrapy.Request(url=build_link(self.base_domain,
                                                html_element['href']),
                                 callback=self.parse,
                                 meta={'parent': parent})

    def closed(self, reason):
        self.scraping_strategy.finish_strategy()