Example #1
0
 def link(self):
   """Return the absolute URL for this item: the repository page when the
   model is Commit, otherwise the project's blog page."""
   if self.model == Commit:
     view = views.commits.show_repository
   else:
     view = views.blogs.show_blog
   relative = reverse(view, args=(self.project.url_path,))
   return join(DOMAIN_NAME, relative)
def scrap_preceeding(base_url,
                     output_dir='/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/',
                     start_index=32):
    """Download every (non-supplementary) PDF linked from the proceedings
    volume pages listed on *base_url*.

    Parameters:
        base_url    -- homepage whose <ul> lists one link per volume.
        output_dir  -- directory under which one folder per volume is
                       created (default preserves the original hard-coded
                       path for backward compatibility).
        start_index -- index of the first volume link to process
                       (default 32, as in the original).

    Side effects: changes the process working directory, creates folders,
    and writes .pdf files to disk.
    """
    homepage_soup = bsoup(web.download(base_url))
    # The volume links live inside the page's <ul> elements.
    anchors = bsoup(str(homepage_soup.find_all('ul'))).find_all('a')
    volume_page_links = [join(base_url, a.get('href')) + '/' for a in anchors]

    os.chdir(output_dir)

    for base_link in volume_page_links[start_index:]:
        # Pick a unique folder name; append -1, -2, ... on a name clash.
        folder_name = base_link.split('/')[-2]
        address = os.path.join(os.getcwd(), folder_name)
        index = 1
        while os.path.exists(address):
            folder_name = base_link.split('/')[-2] + '-' + str(index)
            print(folder_name)
            address = os.path.join(os.getcwd(), folder_name)
            index += 1
        os.mkdir(folder_name)
        os.chdir(address)

        print('--------------')
        print('downloading from ' + base_link)
        volume_content = bsoup(web.download(base_link)).find_all('div', {'id': 'content'})
        page_anchors = bsoup(str(volume_content)).find_all('a')
        pdf_links = [join(base_link, a.get('href'))
                     for a in page_anchors
                     if str(a.get('href')).endswith('pdf')]
        for download_link in pdf_links:
            if download_link.endswith('supp.pdf'):
                continue  # skip supplementary-material PDFs
            try:
                content = web.download(download_link)
            except Exception:  # was a bare except: don't swallow KeyboardInterrupt
                print('link : %s is obsolete' % download_link)
                continue
            # with-statement closes the file even if the write raises
            with open(download_link.split('/')[-1], 'wb') as f:
                f.write(content)
        os.chdir(output_dir)
Example #3
0
def build_database(start_url, debug=False):
    """Crawl pages reachable from *start_url* and collect email addresses.

    Follows every non-mailto anchor breadth-first, visiting each URL at
    most once.  Pages that fail to download or parse are skipped
    (best-effort crawl).

    Parameters:
        start_url -- URL to start crawling from.
        debug     -- when True, print each visited URL, discovered link,
                     and found email.

    Returns: list of unique email addresses (mailto: prefix stripped).

    Fixes over the original: the old loop rebound `urls` mid-iteration
    (`urls = list(set(urls))`), so links appended after the first rebind
    went to a new list that was never iterated — the crawl silently
    truncated.  An explicit queue plus a visited set replaces that.
    """
    from collections import deque  # local import: keeps block self-contained

    queue = deque([start_url])
    visited = {start_url}
    emails = set()
    while queue:
        url = queue.popleft()
        try:
            if debug:
                print(url)
            soup = BeautifulSoup(get(url).read())
            for link in soup.findAll('a', href=True):
                href = link['href']
                if href.startswith('mailto:'):
                    emails.add(href.replace('mailto:', ''))
                    if debug:
                        print("[!!] Email: " + href)
                else:
                    u = join(url, href)
                    if u not in visited:
                        visited.add(u)
                        queue.append(u)
                        if debug:
                            print(u)
        except Exception:
            # best-effort: one broken page must not abort the whole crawl
            continue
    return list(emails)
Example #4
0
 def item_link(self, event):
   """Return the absolute URL for the given event's feed page."""
   relative = reverse(views.feed.event, args=(event.url_path,))
   return join(DOMAIN_NAME, relative)
Example #5
0
 def link(self):
   """Absolute URL: the project's blog page, or the global posts list
   when no project is attached."""
   if self.project is None:
     return join(DOMAIN_NAME, reverse(views.blogs.posts))
   return join(DOMAIN_NAME,
               reverse(views.blogs.show_blog, args=(self.project.url_path,)))
Example #6
0
 def link(self):
   """Absolute URL: the project's repository page, or the all-commits
   page when no project is attached."""
   if self.project is None:
     return join(DOMAIN_NAME, reverse(views.commits.all))
   target = reverse(views.commits.show_repository,
                    args=(self.project.url_path,))
   return join(DOMAIN_NAME, target)
Example #7
0
 def link(self):
   """Absolute URL: the project's page, or the global feed when no
   project is attached."""
   if self.project is None:
     return join(DOMAIN_NAME, reverse(views.feed.feed))
   relative = reverse(views.projects.show, args=(self.project.url_path,))
   return join(DOMAIN_NAME, relative)