import glob
import os
import subprocess

import h5py
import numpy as np

# html_reader, utils, fs, file_struct, and db_gcard_write are project-local
# helpers used below; they are assumed to be provided by the surrounding
# package.


def GCard_Entry(UserSubmissionID, unixtimestamp, url_dir):
    print("Gathering gcards from {0}".format(url_dir))
    if 'http' not in url_dir:  # i.e. url_dir == fs.gcard_default
        utils.printer('Using gcard from /jlab/work')
        gcard_text_db = url_dir
        db_gcard_write(UserSubmissionID, unixtimestamp, gcard_text_db)
    elif 'http' in url_dir:
        utils.printer('Trying to download gcards from online repository')
        if '.gcard' in url_dir:
            utils.printer('Gcard URL name is: ' + url_dir)
            # html_reader returns a tuple; we need the contents of the tuple
            gcard_text = html_reader.html_reader(url_dir, '')[0]
            utils.printer2('HTML from gcard link is: {0}'.format(gcard_text))
            # SQLite can't read " into data fields, only ' characters
            gcard_text_db = gcard_text.replace('"', "'")
            print("\t Gathered gcard '{0}'".format(url_dir))
            db_gcard_write(UserSubmissionID, unixtimestamp, gcard_text_db)
        else:
            raw_html, gcard_urls = html_reader.html_reader(
                url_dir, fs.gcard_identifying_text)
            if len(gcard_urls) == 0:
                print("No gcard files found (they must end in '{0}'). "
                      "Is the online repository correct?".format(
                          fs.gcard_identifying_text))
                exit()
            else:
                for url_ending in gcard_urls:
                    utils.printer('Gcard URL name is: ' + url_ending)
                    # html_reader returns a tuple; we need its contents
                    gcard_text = html_reader.html_reader(
                        url_dir + '/' + url_ending, '')[0]
                    utils.printer2(
                        'HTML from gcard link is: {0}'.format(gcard_text))
                    gcard_text_db = gcard_text.replace('"', "'")
                    print("\t Gathered gcard '{0}'".format(url_ending))
                    db_gcard_write(UserSubmissionID, unixtimestamp,
                                   gcard_text_db)
    # This branch is unreachable: the if/elif above covers every string.
    else:
        print('gcard not recognized as default option or online repository, '
              'please inspect scard')
        exit()
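# Usage sketch (hypothetical values; in the real workflow the submission ID
# and timestamp come from the database layer that calls this function):
#
#     GCard_Entry(UserSubmissionID=117,
#                 unixtimestamp='1546300800',
#                 url_dir='https://example.com/gcards/clas12.gcard')
#
# A URL ending in '.gcard' stores that single file; a directory URL stores
# every gcard link found on the page; any non-http string is stored verbatim
# as the default-gcard marker.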
def Lund_Downloader(lund_url_base, lund_download_dir, lund_filename,
                    single_file=True):
    lund_content = ""
    try:
        full_lund_path = lund_url_base
        if not single_file:
            full_lund_path += "/" + lund_filename
        # html_reader returns a tuple; we need the contents of the tuple
        lund_raw_text = html_reader.html_reader(full_lund_path)[0]
        # This might not be needed; converts from bytes to str
        lund_raw_text = str(lund_raw_text)
        # Not strictly needed, but SQLite can't read " into data fields,
        # only ' characters
        lund_content = lund_raw_text.replace('"', "'")
    except Exception as e:
        print("Unable to download lund file successfully.")
        print("The error encountered was: \n {}".format(e))

    if len(lund_content) > 0:
        try:
            filename = lund_download_dir + "/" + lund_filename
            with open(filename, "a") as file:
                file.write(lund_content)
        except Exception as e:
            print("Unable to save lund file successfully.")
            print("The error encountered was: \n {}".format(e))
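# Usage sketch (URLs are illustrative assumptions): download one file whose
# full URL is known, or one file out of a web directory listing by passing
# single_file=False so the filename is appended to the base URL.
#
#     Lund_Downloader(lund_url_base='https://example.com/lund/events.dat',
#                     lund_download_dir='lund_dir',
#                     lund_filename='events.dat')
#
#     Lund_Downloader(lund_url_base='https://example.com/lund',
#                     lund_download_dir='lund_dir',
#                     lund_filename='events.dat',
#                     single_file=False)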
def count_files(url_dir):
    """ We need to know how many files are going to be downloaded
    before we do this job. This is used in the queue system.

    Inputs:
    -------
    - url_dir (str) - Specifies the location of the lund files.

    Returns:
    --------
    - nfiles (int) - The number of files to be downloaded.
    """
    lund_extensions = ['.dat', '.txt', '.lund']

    # A case used to work around not downloading for types 1/3
    if url_dir == "no_download":
        print('Not downloading files due to SCard type.')
        return 0

    # Case 3/4
    if 'http' in url_dir:
        # Single web file
        if any([ext in url_dir for ext in lund_extensions]):
            return 1
        # Web directory
        else:
            raw_html, lund_urls = html_reader.html_reader(
                url_dir, fs.lund_identifying_text)
            return len(lund_urls)
    # Case 1/2
    else:
        # Single local file
        if any([ext in url_dir for ext in lund_extensions]):
            return 1
        # Local directory, many files
        else:
            lund_files = glob.glob(url_dir + '*')
            return len(lund_files)

    # Something weird happened.
    return 0
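# Usage sketch: count_files sizes the queue before any download starts.
# The paths below are assumptions for illustration only.
#
#     count_files('no_download')                      # -> 0 (scard types 1/3)
#     count_files('https://example.com/events.lund')  # -> 1 (single web file)
#     count_files('/volatile/user/lund/')             # -> number of files
#                                                     #    matching the glob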
def download_gcards(url):
    """Download the gcard, or gcards, located at the provided URL.

    Input:
    ------
    url - Gcard URL provided in the scard.

    Returns:
    --------
    gcards - List of gcards collected from the URL.

    To Do:
    ------
    - Add logging to replace the utils.printer commands removed.
    """
    gcards = []

    if '.gcard' in url:
        # This returns a tuple, we need the contents of the tuple
        gcard = html_reader.html_reader(url, '')[0]
        gcards.append(gcard)

    # There could be an online directory that contains gcards
    # specified by the scard; here we need to search for the
    # gcards that it contains and add them to our list.
    else:
        raw_html, gcard_urls = html_reader.html_reader(
            url, fs.gcard_identifying_text)
        for url_ending in gcard_urls:
            # This returns a tuple, we need the contents of the tuple
            gcard = html_reader.html_reader(url + '/' + url_ending, '')[0]
            gcards.append(gcard)

    # SQLite data fields can't hold " characters, so swap them for '
    return [g.replace('"', "'") for g in gcards]
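# Usage sketch (assumed URL): the returned list is ready for database
# insertion because double quotes have already been replaced.
#
#     for gcard in download_gcards('https://example.com/gcards'):
#         db_gcard_write(UserSubmissionID, unixtimestamp, gcard)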
# Alternate revision of GCard_Entry, keyed by BatchID; it short-circuits on
# the configured default gcard and only accepts 'https://' repositories.
def GCard_Entry(BatchID, unixtimestamp, url_dir):
    print("Gathering gcards from {0}".format(url_dir))
    if url_dir == file_struct.gcard_default:
        utils.printer('Using gcard from /jlab/work')
        gcard_text_db = url_dir
        db_gcard_write(BatchID, unixtimestamp, gcard_text_db)
    elif 'https://' in url_dir:
        utils.printer('Trying to download gcards from online repository')
        raw_html, gcard_urls = html_reader.html_reader(
            url_dir, file_struct.gcard_identifying_text)
        for url_ending in gcard_urls:
            utils.printer('Gcard URL name is: ' + url_ending)
            # This returns a tuple, we need the contents of the tuple
            gcard_text = html_reader.html_reader(
                url_dir + '/' + url_ending, '')[0]
            utils.printer2('HTML from gcard link is: {0}'.format(gcard_text))
            gcard_text_db = gcard_text.replace('"', "'")
            print("\t Gathered gcard '{0}'".format(url_ending))
            db_gcard_write(BatchID, unixtimestamp, gcard_text_db)
    else:
        print('gcard not recognized as default option or online repository, '
              'please inspect scard')
        exit()
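# Usage sketch: unlike the revision above, this one matches the default gcard
# exactly rather than treating any non-http string as the default (the BatchID
# value is illustrative):
#
#     GCard_Entry(BatchID=7, unixtimestamp='1546300800',
#                 url_dir=file_struct.gcard_default)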
# Alternate revision of Lund_Downloader that takes a pre-scraped list of
# lund URLs and writes each file into lund_dir.
def Lund_Downloader(url_dir, lund_urls, lund_dir):
    if len(lund_urls) == 0:
        print("No Lund files found (they must end in '{0}'). "
              "Is the online repository correct?".format(
                  fs.lund_identifying_text))
        exit()
    else:
        for url_ending in lund_urls:
            utils.printer('Lund URL name is: ' + url_ending)
            # This returns a tuple, we need the contents of the tuple
            lund_text = html_reader.html_reader(
                url_dir + '/' + url_ending, '')[0]
            utils.printer2('HTML from lund link is: {0}'.format(lund_text))
            # Not strictly needed, but SQLite can't read " into data fields,
            # only ' characters
            lund_text_db = lund_text.replace('"', "'")
            print("\t Gathered lund file '{0}'".format(url_ending))
            filename = lund_dir + "/" + url_ending
            with open(filename, "a") as file:
                file.write(lund_text_db)
def Lund_Entry(url_dir, target_dir):
    """
    Download or copy lund files and return the name of the target directory.

    Inputs:
    -------
    url_dir    - A string containing the directory or path to lund file(s).
    target_dir - A string naming the directory to download or copy into.

    Returns:
    --------
    lund_dir - A string containing the name of the downloaded directory.

    A few cases can occur:
        1) One local file: extension will be .txt, .dat, or .lund and
           the string will not contain http.
        2) Several local files: no extension will be given. The string
           will not contain http.
        3) One web file: extension will be .txt, .dat, or .lund and
           the string will contain http.
        4) Many web files: no extension will be given. The string will
           contain http.
    """
    lund_extensions = ['.dat', '.txt', '.lund']
    lund_dir = target_dir

    # A case used to work around not downloading for types 1/3
    if url_dir == "no_download":
        print('Not downloading files due to SCard type.')
        return lund_dir

    if os.path.exists(lund_dir):
        print('Lund directory already exists, not downloading again.')
        return lund_dir

    # Create dir.
    subprocess.call(['mkdir', '-p', lund_dir])

    # Case 3/4
    if 'http' in url_dir:
        # Single web file
        if any([ext in url_dir for ext in lund_extensions]):
            filename = url_dir.split("/")[-1]
            # This returns a tuple, we need the contents of the tuple
            lund_text = html_reader.html_reader(url_dir, '')[0]
            utils.printer2('HTML from lund link is: {0}'.format(lund_text))
            # Not strictly needed, but SQLite can't read " into data fields,
            # only ' characters
            lund_text_db = lund_text.replace('"', "'")
            print("\t Gathered lund file '{0}'".format(url_dir))
            with open(lund_dir + "/" + filename, "a") as file:
                file.write(lund_text_db)
        # Web directory
        else:
            raw_html, lund_urls = html_reader.html_reader(
                url_dir, fs.lund_identifying_text)
            subprocess.call(['mkdir', '-p', lund_dir])
            Lund_Downloader(url_dir, lund_urls, lund_dir)
    # Case 1/2
    else:
        # Single local file
        if any([ext in url_dir for ext in lund_extensions]):
            subprocess.call(['cp', url_dir, lund_dir + '/'])
        # Local directory, many files
        else:
            print('Downloading all files in {}'.format(url_dir))
            lund_files = glob.glob(url_dir + '*')
            print(lund_files)
            for lf in lund_files:
                if any([ext in lf for ext in lund_extensions]):
                    subprocess.call(['cp', lf, lund_dir + '/'])

    return lund_dir
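# Usage sketch mapping inputs to the four cases in the docstring (paths are
# assumptions for illustration):
#
#     Lund_Entry('/work/lund/events.dat', 'lund_dir')        # case 1: cp one file
#     Lund_Entry('/work/lund/', 'lund_dir')                  # case 2: cp matching files
#     Lund_Entry('https://example.com/ev.lund', 'lund_dir')  # case 3: download one file
#     Lund_Entry('https://example.com/lund', 'lund_dir')     # case 4: scrape + download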
def Lund_Entry(lund_location, lund_download_dir="lund_dir/"):
    valid_lund_extensions = ['.dat', '.txt', '.lund']

    # Make sure lund_download_dir ends with a /, and if not, add one.
    # (`!=` rather than `is not`: identity checks on string literals are a bug.)
    if lund_download_dir[-1] != "/":
        lund_download_dir += "/"

    # A case used to work around not downloading for types 1/3
    if lund_location == "no_download":
        print('Not downloading files due to SCard type.')
        return lund_location
    elif os.path.exists(lund_download_dir):
        print('Lund directory already exists, not downloading again.')
        return lund_download_dir

    # Create dir. to download / copy files into
    try:
        subprocess.call(['mkdir', '-p', lund_download_dir])
    except Exception as e:
        print("WARNING: unable to make directory {}".format(lund_download_dir))
        print("The error encountered was: \n {}".format(e))

    ##################################################################
    # Case 3/4 - download single / multiple files from online location
    ##################################################################
    if 'http' in lund_location:
        # Download single web file
        if any([ext in lund_location for ext in valid_lund_extensions]):
            # This gets the name of the lund file, assuming the format
            # is http......./lund_file_name
            lund_filename = lund_location.split("/")[-1]
            # Pass the location, file, and download dir to the downloader
            Lund_Downloader(lund_url_base=lund_location,
                            lund_download_dir=lund_download_dir,
                            lund_filename=lund_filename)
        # Download entire web directory
        else:
            try:
                # Read the given location to find all the lund files
                raw_html, lund_filenames = html_reader.html_reader(
                    lund_location, valid_lund_extensions)
            except Exception as e:
                print("ERROR: unable to download lund files from {}".format(
                    lund_location))
                print("The error encountered was: \n {}".format(e))
                exit()

            if len(lund_filenames) == 0:
                print("No Lund files found (they must end in '{}'). "
                      "Is the online repository correct?".format(
                          valid_lund_extensions))
                exit()

            # Loop through, downloading every LUND file in the directory
            for lund_filename in lund_filenames:
                Lund_Downloader(lund_url_base=lund_location,
                                lund_download_dir=lund_download_dir,
                                lund_filename=lund_filename,
                                single_file=False)
    #######################################################################
    # Case 1/2 - Use rsync to copy files from a JLab location to OSG.
    # rsync option: -rlpgoD replaces -a (-rlptgoD) so mtimes are not
    # preserved: copied files get a new timestamp, which plays nicely
    # with our autodeletion cronjobs.
    #######################################################################
    else:
        # Single local file
        if any([ext in lund_location for ext in valid_lund_extensions]):
            try:
                if lund_location[0] != "/":
                    lund_location = "/" + lund_location
                # Example full filepath:
                # gemc@dtn1902-ib:/lustre19/expphy/volatile/clas12/robertej/testlund.txt
                lund_copy_path = ('gemc@dtn1902-ib:/lustre19/expphy'
                                  + lund_location)
                subprocess.call(
                    ['rsync', '-rlpgoD', lund_copy_path, lund_download_dir])
            except Exception as e:
                print("ERROR: unable to copy lund files from {}".format(
                    lund_location))
                print("The error encountered was: \n {}".format(e))
        # Local directory, many files
        else:
            if lund_location[0] != "/":
                lund_location = "/" + lund_location
            if lund_location[-1] != "/":
                lund_location += "/"
            if "/lustre19/expphy" not in lund_location:
                lund_location = '/lustre19/expphy' + lund_location
            lund_copy_path = 'gemc@dtn1902-ib:' + lund_location
            subprocess.call([
                'rsync', '-zrlpgoDv', '--prune-empty-dirs',
                "--include='*.dat'", "--include='*.txt'", "--exclude='*'",
                lund_copy_path, lund_download_dir
            ])
            # The quoted include/exclude patterns above do not filter as
            # intended when passed through subprocess (the quotes become part
            # of the pattern), so prune non-lund files after the transfer.
            files = os.listdir(lund_download_dir)
            for f in files:
                if not any([ext in f for ext in valid_lund_extensions]):
                    os.remove(lund_download_dir + f)

    return lund_download_dir
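# For reference, the directory branch above behaves roughly like this shell
# command (the host and lustre prefix come from the code; the source directory
# is an illustrative assumption), followed by the extension-based cleanup loop:
#
#     rsync -zrlpgoDv --prune-empty-dirs \
#         gemc@dtn1902-ib:/lustre19/expphy/volatile/clas12/user/lund/ lund_dir/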
def index():
    # `num` and `convert` are assumed to be defined elsewhere in this module
    # (the range of document numbers and a zero-padding helper, respectively).
    papers = [html_reader('cacm/CACM-%s.html' % convert(i)) for i in num]

    # Year index: a paper is keyed by both its date field and metadata year.
    years = set()
    for item in papers:
        years.update((item.metadata[6], item.date[0]))
    index_year = {year: set() for year in years}
    for item in papers:
        index_year[item.date[0]].add(item.id)
        index_year[item.metadata[6]].add(item.id)
    for key in index_year:
        index_year[key] = np.sort(np.array(list(index_year[key])))
    year_file = h5py.File("year_index.h5", 'w')
    for key in index_year:
        year_file.create_dataset(name=key, data=index_year[key])
    year_file.close()

    # Month index, likewise keyed from both the date field and the metadata.
    months = set()
    for item in papers:
        months.update((item.date[1], item.metadata[2]))
    index_month = {month: set() for month in months}
    for item in papers:
        index_month[item.date[1]].add(item.id)
        index_month[item.metadata[2]].add(item.id)
    for key in index_month:
        index_month[key] = np.sort(np.array(list(index_month[key])))
    month_file = h5py.File("month_index.h5", 'w')
    for key in index_month:
        month_file.create_dataset(name=key, data=index_month[key])
    month_file.close()

    # ID index.
    ids = set()
    for item in papers:
        ids.add(item.metadata[0])
    index_id = {paper_id: set() for paper_id in ids}
    for item in papers:
        index_id[item.metadata[0]].add(item.id)
    for key in index_id:
        index_id[key] = np.sort(np.array(list(index_id[key])))
    id_file = h5py.File("id_index.h5", 'w')
    for key in index_id:
        id_file.create_dataset(name=key, data=index_id[key])
    id_file.close()

    # SIG index.
    sigs = set()
    for item in papers:
        sigs.add(item.metadata[1])
    index_sig = {sig: set() for sig in sigs}
    for item in papers:
        index_sig[item.metadata[1]].add(item.id)
    for key in index_sig:
        index_sig[key] = np.sort(np.array(list(index_sig[key])))
    sig_file = h5py.File("sig_index.h5", "w")
    for key in index_sig:
        sig_file.create_dataset(name=key, data=index_sig[key])
    sig_file.close()

    # Time index, keyed on metadata fields 4 and 5 joined together.
    times = set()
    for item in papers:
        times.add(item.metadata[4] + " " + item.metadata[5])
    index_time = {time: set() for time in times}
    for item in papers:
        index_time[item.metadata[4] + " " + item.metadata[5]].add(item.id)
    for key in index_time:
        index_time[key] = np.sort(np.array(list(index_time[key])))
    time_file = h5py.File("time_index.h5", "w")
    for key in index_time:
        time_file.create_dataset(name=key, data=index_time[key])
    time_file.close()

    # Author index: each paper may contribute several authors.
    authors = set()
    for item in papers:
        authors.update(item.author)
    index_author = {author: set() for author in authors}
    for item in papers:
        for a in item.author:
            index_author[a].add(item.id)
    for key in index_author:
        index_author[key] = np.sort(np.array(list(index_author[key])))
    author_file = h5py.File("author_index.h5", "w")
    for key in index_author:
        author_file.create_dataset(name=key, data=index_author[key])
    author_file.close()

    # Date index, keyed on metadata fields 2 and 3 joined together.
    dates = set()
    for item in papers:
        dates.add(item.metadata[2] + " " + item.metadata[3])
    index_date = {date: set() for date in dates}
    for item in papers:
        index_date[item.metadata[2] + " " + item.metadata[3]].add(item.id)
    for key in index_date:
        index_date[key] = np.sort(np.array(list(index_date[key])))
    date_file = h5py.File("date_index.h5", "w")
    for key in index_date:
        date_file.create_dataset(name=key, data=index_date[key])
    date_file.close()  # was author_file.close(), which left this file open

    # Word index over each paper's corpus (the inverted index proper).
    words = set()
    for item in papers:
        words.update(item.corpus)
    index_word = {word: set() for word in words}
    for item in papers:
        for word in item.corpus:
            index_word[word].add(item.id)
    for key in index_word:
        index_word[key] = np.sort(np.array(list(index_word[key])))
    word_file = h5py.File("word_index.h5", "w")
    for key in index_word:
        word_file.create_dataset(name=key, data=index_word[key])
    word_file.close()
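# Usage sketch: reading a posting list back from one of the generated HDF5
# indices. The key 'algorithm' is an illustrative assumption; real keys are
# whatever years, authors, or corpus words occurred in the indexed papers.
#
#     import h5py
#     with h5py.File("word_index.h5", "r") as f:
#         paper_ids = f["algorithm"][:]  # sorted array of matching paper ids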