Ejemplo n.º 1
0
 def test_extract(self):
     """Check the first and last words extracted from the SAMPLE1 fixture."""
     fixture_path = 'webarticle2text/fixtures/SAMPLE1.html'
     with open(fixture_path, 'rb') as fixture:
         article = webarticle2text.extractFromHTML(fixture.read())
     # Echo the extracted text so it is visible in the test output.
     print(article)
     expected_head = 'Python 3.6.0 is now available!'
     expected_tail = 'is now available for download on python.org.'
     self.assertTrue(article.startswith(expected_head))
     self.assertTrue(article.endswith(expected_tail))
Ejemplo n.º 2
0
 def test_extract(self):
     """Extract SAMPLE1.html and verify how the resulting text begins and ends."""
     with open('webarticle2text/fixtures/SAMPLE1.html', 'rb') as handle:
         html_bytes = handle.read()
         text = webarticle2text.extractFromHTML(html_bytes)
     print(text)
     # Only the boundaries of the article are pinned, not the full body.
     begins_ok = text.startswith('Python 3.6.0 is now available!')
     self.assertTrue(begins_ok)
     ends_ok = text.endswith('is now available for download on python.org.')
     self.assertTrue(ends_ok)
Ejemplo n.º 3
0
def article_summary(url):
    """Return a small HTML body holding the page title and its main text.

    The page is opened through ``op`` (defined elsewhere in this file),
    characters outside ``string.printable`` are filtered out, the main
    article text is extracted with ``webarticle2text`` and split into
    sentences by ``sent_detector``; the sentences are then re-joined with
    single spaces inside one ``<p>`` element.
    """
    def is_printable(ch):
        return ch in string.printable

    response = op.open(url)
    title = op.title()
    clean_html = filter(is_printable, response.read())
    main_text = webarticle2text.extractFromHTML(clean_html)
    paragraph = ' '.join(sent_detector.tokenize(main_text))
    return "<body> <h1>%s</h1> <p>%s</p></body>" % (title, paragraph)
Ejemplo n.º 4
0
def index(crawl_dir,index_dir,get_first_N=None):
    """Bring the search index in ``index_dir`` up to date with ``crawl_dir``.

    A catalogue file ``loogle_files.csv`` inside ``index_dir`` records the
    path and size of every file that has been indexed.  On each run:

    * files in the catalogue that are no longer listed under ``crawl_dir``
      are deleted from the index and from the catalogue;
    * files whose size differs from the catalogue are deleted from the
      index and then indexed again as if they were new;
    * files not in the catalogue whose extension is in ``exts`` are
      converted to text, indexed, and added to the catalogue.

    Args:
        crawl_dir: directory handed to ``rsync.ls`` to list candidate files.
        index_dir: directory holding the index and ``loogle_files.csv``.
        get_first_N: when truthy, only the first N candidate files are
            considered for indexing in this run.
    """
    catalog_path = index_dir + "/loogle_files.csv"

    df_files = []
    file_indexed_dict = {}
    if os.path.isfile(catalog_path):
        df = pd.read_csv(catalog_path)
        df_files = df[['file','size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        ix = open_dir(index_dir)
    else:
        pdf_schema = Schema(path = ID(stored=True),
                            title = TEXT(stored=True),
                            text = TEXT)
        ix = create_in(index_dir, pdf_schema)

    writer = ix.writer()

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\","/"),size) for (f,size) in files]
    files = [(f,size) for (f,size) in files if os.path.splitext(f)[1] in exts]

    files_crawled_dict = dict(files)
    # Build a fresh dict because file_indexed_dict cannot be changed while
    # it is being iterated.  Only unchanged files are carried over.
    still_indexed = {}
    for ff in file_indexed_dict:
        if ff not in files_crawled_dict:
            # In the index but gone from the file system.  It is dropped
            # from the catalogue too, so that a file which later reappears
            # with the same size is indexed again instead of being skipped.
            print('%s removed' % ff)
            writer.delete_by_term('path', unicode(ff))
        elif files_crawled_dict[ff] != file_indexed_dict[ff]:
            # Size changed: forget the file so the loop below sees it as
            # new and reindexes it.
            print('%s size different update' % ff)
            writer.delete_by_term('path', unicode(ff))
        else:
            still_indexed[ff] = file_indexed_dict[ff]

    file_indexed_dict = still_indexed
    # Keep the catalogue rows in step with the dict; otherwise removed
    # files linger in the CSV and updated files end up with duplicate rows.
    df_files = [row for row in df_files if row[0] in file_indexed_dict]

    if get_first_N:
        files = files[:get_first_N]

    for path, size in files:
        if path in file_indexed_dict:
            print('skipping %s' % path)
            continue
        print('processing %s' % path)

        tmp_dir = os.environ['TEMP']
        tmp_txt = "%s/loog.txt" % tmp_dir
        # Remove the previous file's text so a failed or missing conversion
        # cannot index stale content under this path.
        if os.path.exists(tmp_txt):
            os.remove(tmp_txt)

        ext = os.path.splitext(path)[1]
        # NOTE(review): the file name is interpolated into a shell command;
        # quoting depends on pdfcmd/djvucmd, which are defined elsewhere.
        if ext == ".pdf":
            os.system(pdfcmd % (path, tmp_dir))
        elif ext == ".djvu":
            os.system(djvucmd % (path, tmp_dir))
            os.system("unix2dos -7 %s" % tmp_txt)
        elif ext == ".html":
            with codecs.open(path, encoding='utf-8') as f:
                content = f.read()
            content = webarticle2text.extractFromHTML(content)
            with open(tmp_txt, "w") as fout:
                fout.write(content.encode("utf8"))
        elif ext == ".txt":
            shutil.copy(path, tmp_txt)

        # turn the file name itself into content as well just in case,
        # if text conversion does not output anything, at least we can
        # use some information from the file name for search
        filename_as_content = os.path.basename(path).replace("_"," ").replace("-"," ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts:
            filename_as_content = filename_as_content.replace(x,"")
        filename_as_content += " "

        if os.path.exists(tmp_txt):
            with codecs.open(tmp_txt, encoding='utf-8') as f:
                content = f.read()
        else:
            # Conversion produced nothing; index the file by name only.
            content = u""
        writer.add_document(path = unicode(path),
                            title = unicode(filename_as_content),
                            text = unicode(content))
        df_files.append([path, size])

    writer.commit()
    df_files = pd.DataFrame(df_files,columns=['file','size'])
    df_files.to_csv(catalog_path,index=None)
Ejemplo n.º 5
0
 def extract_html_content(self, content=None):
     """Return the main text that webarticle2text extracts from the HTML.

     Falls back to ``self.content`` when ``content`` is missing or empty.
     """
     html = content or self.content
     return webarticle2text.extractFromHTML(html)
Ejemplo n.º 6
0
def index(crawl_dir, index_dir, get_first_N=None):
    """Bring the search index in ``index_dir`` up to date with ``crawl_dir``.

    A catalogue file ``loogle_files.csv`` inside ``index_dir`` records the
    path and size of every file that has been indexed.  On each run:

    * files in the catalogue that are no longer listed under ``crawl_dir``
      are deleted from the index and from the catalogue;
    * files whose size differs from the catalogue are deleted from the
      index and then indexed again as if they were new;
    * files not in the catalogue whose extension is in ``exts`` are
      converted to text, indexed, and added to the catalogue.

    Args:
        crawl_dir: directory handed to ``rsync.ls`` to list candidate files.
        index_dir: directory holding the index and ``loogle_files.csv``.
        get_first_N: when truthy, only the first N candidate files are
            considered for indexing in this run.
    """
    catalog_path = index_dir + "/loogle_files.csv"

    df_files = []
    file_indexed_dict = {}
    if os.path.isfile(catalog_path):
        df = pd.read_csv(catalog_path)
        df_files = df[['file', 'size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        ix = open_dir(index_dir)
    else:
        pdf_schema = Schema(path=ID(stored=True),
                            title=TEXT(stored=True),
                            text=TEXT)
        ix = create_in(index_dir, pdf_schema)

    writer = ix.writer()

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\", "/"), size) for (f, size) in files]
    files = [(f, size) for (f, size) in files
             if os.path.splitext(f)[1] in exts]

    files_crawled_dict = dict(files)
    # Build a fresh dict because file_indexed_dict cannot be changed while
    # it is being iterated.  Only unchanged files are carried over.
    still_indexed = {}
    for ff in file_indexed_dict:
        if ff not in files_crawled_dict:
            # In the index but gone from the file system.  It is dropped
            # from the catalogue too, so that a file which later reappears
            # with the same size is indexed again instead of being skipped.
            print('%s removed' % ff)
            writer.delete_by_term('path', unicode(ff))
        elif files_crawled_dict[ff] != file_indexed_dict[ff]:
            # Size changed: forget the file so the loop below sees it as
            # new and reindexes it.
            print('%s size different update' % ff)
            writer.delete_by_term('path', unicode(ff))
        else:
            still_indexed[ff] = file_indexed_dict[ff]

    file_indexed_dict = still_indexed
    # Keep the catalogue rows in step with the dict; otherwise removed
    # files linger in the CSV and updated files end up with duplicate rows.
    df_files = [row for row in df_files if row[0] in file_indexed_dict]

    if get_first_N:
        files = files[:get_first_N]

    for path, size in files:
        if path in file_indexed_dict:
            print('skipping %s' % path)
            continue
        print('processing %s' % path)

        tmp_dir = os.environ['TEMP']
        tmp_txt = "%s/loog.txt" % tmp_dir
        # Remove the previous file's text so a failed or missing conversion
        # cannot index stale content under this path.
        if os.path.exists(tmp_txt):
            os.remove(tmp_txt)

        ext = os.path.splitext(path)[1]
        # NOTE(review): the file name is interpolated into a shell command;
        # quoting depends on pdfcmd/djvucmd, which are defined elsewhere.
        if ext == ".pdf":
            os.system(pdfcmd % (path, tmp_dir))
        elif ext == ".djvu":
            os.system(djvucmd % (path, tmp_dir))
            os.system("unix2dos -7 %s" % tmp_txt)
        elif ext == ".html":
            with codecs.open(path, encoding='utf-8') as f:
                content = f.read()
            content = webarticle2text.extractFromHTML(content)
            with open(tmp_txt, "w") as fout:
                fout.write(content.encode("utf8"))
        elif ext == ".txt":
            shutil.copy(path, tmp_txt)

        # turn the file name itself into content as well just in case,
        # if text conversion does not output anything, at least we can
        # use some information from the file name for search
        filename_as_content = os.path.basename(path).replace("_", " ").replace(
            "-", " ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts:
            filename_as_content = filename_as_content.replace(x, "")
        filename_as_content += " "

        if os.path.exists(tmp_txt):
            with codecs.open(tmp_txt, encoding='utf-8') as f:
                content = f.read()
        else:
            # Conversion produced nothing; index the file by name only.
            content = u""
        writer.add_document(path=unicode(path),
                            title=unicode(filename_as_content),
                            text=unicode(content))
        df_files.append([path, size])

    writer.commit()
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(catalog_path, index=None)