import glob
import os
import subprocess

import h5py
import numpy as np

# html_reader, utils, fs, file_struct, and db_gcard_write are project-local
# helpers used below; they are assumed to be provided by the surrounding
# package.


def GCard_Entry(UserSubmissionID, unixtimestamp, url_dir):
    print("Gathering gcards from {0}".format(url_dir))
    if 'http' not in url_dir:  # i.e. url_dir == fs.gcard_default
        utils.printer('Using gcard from /jlab/work')
        gcard_text_db = url_dir
        db_gcard_write(UserSubmissionID, unixtimestamp, gcard_text_db)
    elif 'http' in url_dir:
        utils.printer('Trying to download gcards from online repository')
        if '.gcard' in url_dir:
            utils.printer('Gcard URL name is: ' + url_dir)
            # html_reader returns a tuple; we need the contents of the tuple
            gcard_text = html_reader.html_reader(url_dir, '')[0]
            utils.printer2('HTML from gcard link is: {0}'.format(gcard_text))
            # SQLite can't read " into data fields, only ' characters
            gcard_text_db = gcard_text.replace('"', "'")
            print("\t Gathered gcard '{0}'".format(url_dir))
            db_gcard_write(UserSubmissionID, unixtimestamp, gcard_text_db)
        else:
            raw_html, gcard_urls = html_reader.html_reader(
                url_dir, fs.gcard_identifying_text)
            if len(gcard_urls) == 0:
                print("No gcard files found (they must end in '{0}'). "
                      "Is the online repository correct?".format(
                          fs.gcard_identifying_text))
                exit()
            else:
                for url_ending in gcard_urls:
                    utils.printer('Gcard URL name is: ' + url_ending)
                    # html_reader returns a tuple; we need its contents
                    gcard_text = html_reader.html_reader(
                        url_dir + '/' + url_ending, '')[0]
                    utils.printer2(
                        'HTML from gcard link is: {0}'.format(gcard_text))
                    gcard_text_db = gcard_text.replace('"', "'")
                    print("\t Gathered gcard '{0}'".format(url_ending))
                    db_gcard_write(UserSubmissionID, unixtimestamp,
                                   gcard_text_db)
    # This branch is unreachable: the if/elif above covers every string.
    else:
        print('gcard not recognized as default option or online repository, '
              'please inspect scard')
        exit()
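# Usage sketch (hypothetical values; in the real workflow the submission ID
# and timestamp come from the database layer that calls this function):
#
#     GCard_Entry(UserSubmissionID=117,
#                 unixtimestamp='1546300800',
#                 url_dir='https://example.com/gcards/clas12.gcard')
#
# A URL ending in '.gcard' stores that single file; a directory URL stores
# every gcard link found on the page; any non-http string is stored verbatim
# as the default-gcard marker.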
def Lund_Downloader(lund_url_base, lund_download_dir, lund_filename,
                    single_file=True):
    lund_content = ""
    try:
        full_lund_path = lund_url_base
        if not single_file:
            full_lund_path += "/" + lund_filename
        # html_reader returns a tuple; we need the contents of the tuple
        lund_raw_text = html_reader.html_reader(full_lund_path)[0]
        # This might not be needed; converts from bytes to str
        lund_raw_text = str(lund_raw_text)
        # Not strictly needed, but SQLite can't read " into data fields,
        # only ' characters
        lund_content = lund_raw_text.replace('"', "'")
    except Exception as e:
        print("Unable to download lund file successfully.")
        print("The error encountered was: \n {}".format(e))

    if len(lund_content) > 0:
        try:
            filename = lund_download_dir + "/" + lund_filename
            with open(filename, "a") as file:
                file.write(lund_content)
        except Exception as e:
            print("Unable to save lund file successfully.")
            print("The error encountered was: \n {}".format(e))
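# Usage sketch (URLs are illustrative assumptions): download one file whose
# full URL is known, or one file out of a web directory listing by passing
# single_file=False so the filename is appended to the base URL.
#
#     Lund_Downloader(lund_url_base='https://example.com/lund/events.dat',
#                     lund_download_dir='lund_dir',
#                     lund_filename='events.dat')
#
#     Lund_Downloader(lund_url_base='https://example.com/lund',
#                     lund_download_dir='lund_dir',
#                     lund_filename='events.dat',
#                     single_file=False)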
def count_files(url_dir):
    """ We need to know how many files are going to be downloaded
    before we do this job. This is used in the queue system.

    Inputs:
    -------
    - url_dir (str) - Specifies the location of the lund files.

    Returns:
    --------
    - nfiles (int) - The number of files to be downloaded.
    """
    lund_extensions = ['.dat', '.txt', '.lund']

    # A case used to work around not downloading for types 1/3
    if url_dir == "no_download":
        print('Not downloading files due to SCard type.')
        return 0

    # Case 3/4
    if 'http' in url_dir:
        # Single web file
        if any([ext in url_dir for ext in lund_extensions]):
            return 1
        # Web directory
        else:
            raw_html, lund_urls = html_reader.html_reader(
                url_dir, fs.lund_identifying_text)
            return len(lund_urls)
    # Case 1/2
    else:
        # Single local file
        if any([ext in url_dir for ext in lund_extensions]):
            return 1
        # Local directory, many files
        else:
            lund_files = glob.glob(url_dir + '*')
            return len(lund_files)

    # Something weird happened.
    return 0
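# Usage sketch: count_files sizes the queue before any download starts.
# The paths below are assumptions for illustration only.
#
#     count_files('no_download')                      # -> 0 (scard types 1/3)
#     count_files('https://example.com/events.lund')  # -> 1 (single web file)
#     count_files('/volatile/user/lund/')             # -> number of files
#                                                     #    matching the glob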
def download_gcards(url):
    """Download the gcard, or gcards, located at the provided URL.

    Input:
    ------
    url - Gcard URL provided in the scard.

    Returns:
    --------
    gcards - List of gcards collected from the URL.

    To Do:
    ------
    - Add logging to replace the utils.printer commands removed.
    """
    gcards = []

    if '.gcard' in url:
        # This returns a tuple, we need the contents of the tuple
        gcard = html_reader.html_reader(url, '')[0]
        gcards.append(gcard)

    # There could be an online directory that contains gcards
    # specified by the scard; here we need to search for the
    # gcards that it contains and add them to our list.
    else:
        raw_html, gcard_urls = html_reader.html_reader(
            url, fs.gcard_identifying_text)
        for url_ending in gcard_urls:
            # This returns a tuple, we need the contents of the tuple
            gcard = html_reader.html_reader(url + '/' + url_ending, '')[0]
            gcards.append(gcard)

    # SQLite data fields can't hold " characters, so swap them for '
    return [g.replace('"', "'") for g in gcards]
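# Usage sketch (assumed URL): the returned list is ready for database
# insertion because double quotes have already been replaced.
#
#     for gcard in download_gcards('https://example.com/gcards'):
#         db_gcard_write(UserSubmissionID, unixtimestamp, gcard)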
# Alternate revision of GCard_Entry, keyed by BatchID; it short-circuits on
# the configured default gcard and only accepts 'https://' repositories.
def GCard_Entry(BatchID, unixtimestamp, url_dir):
    print("Gathering gcards from {0}".format(url_dir))
    if url_dir == file_struct.gcard_default:
        utils.printer('Using gcard from /jlab/work')
        gcard_text_db = url_dir
        db_gcard_write(BatchID, unixtimestamp, gcard_text_db)
    elif 'https://' in url_dir:
        utils.printer('Trying to download gcards from online repository')
        raw_html, gcard_urls = html_reader.html_reader(
            url_dir, file_struct.gcard_identifying_text)
        for url_ending in gcard_urls:
            utils.printer('Gcard URL name is: ' + url_ending)
            # This returns a tuple, we need the contents of the tuple
            gcard_text = html_reader.html_reader(
                url_dir + '/' + url_ending, '')[0]
            utils.printer2('HTML from gcard link is: {0}'.format(gcard_text))
            gcard_text_db = gcard_text.replace('"', "'")
            print("\t Gathered gcard '{0}'".format(url_ending))
            db_gcard_write(BatchID, unixtimestamp, gcard_text_db)
    else:
        print('gcard not recognized as default option or online repository, '
              'please inspect scard')
        exit()
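# Usage sketch: unlike the revision above, this one matches the default gcard
# exactly rather than treating any non-http string as the default (the BatchID
# value is illustrative):
#
#     GCard_Entry(BatchID=7, unixtimestamp='1546300800',
#                 url_dir=file_struct.gcard_default)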
# Alternate revision of Lund_Downloader that takes a pre-scraped list of
# lund URLs and writes each file into lund_dir.
def Lund_Downloader(url_dir, lund_urls, lund_dir):
    if len(lund_urls) == 0:
        print("No Lund files found (they must end in '{0}'). "
              "Is the online repository correct?".format(
                  fs.lund_identifying_text))
        exit()
    else:
        for url_ending in lund_urls:
            utils.printer('Lund URL name is: ' + url_ending)
            # This returns a tuple, we need the contents of the tuple
            lund_text = html_reader.html_reader(
                url_dir + '/' + url_ending, '')[0]
            utils.printer2('HTML from lund link is: {0}'.format(lund_text))
            # Not strictly needed, but SQLite can't read " into data fields,
            # only ' characters
            lund_text_db = lund_text.replace('"', "'")
            print("\t Gathered lund file '{0}'".format(url_ending))
            filename = lund_dir + "/" + url_ending
            with open(filename, "a") as file:
                file.write(lund_text_db)
def Lund_Entry(url_dir, target_dir):
    """
    Download or copy lund files and return the name of the target directory.

    Inputs:
    -------
    url_dir    - A string containing the directory or path to lund file(s).
    target_dir - A string naming the directory to download or copy into.

    Returns:
    --------
    lund_dir - A string containing the name of the downloaded directory.

    A few cases can occur:
        1) One local file: extension will be .txt, .dat, or .lund and
           the string will not contain http.
        2) Several local files: no extension will be given. The string
           will not contain http.
        3) One web file: extension will be .txt, .dat, or .lund and
           the string will contain http.
        4) Many web files: no extension will be given. The string will
           contain http.
    """
    lund_extensions = ['.dat', '.txt', '.lund']
    lund_dir = target_dir

    # A case used to work around not downloading for types 1/3
    if url_dir == "no_download":
        print('Not downloading files due to SCard type.')
        return lund_dir

    if os.path.exists(lund_dir):
        print('Lund directory already exists, not downloading again.')
        return lund_dir

    # Create dir.
    subprocess.call(['mkdir', '-p', lund_dir])

    # Case 3/4
    if 'http' in url_dir:
        # Single web file
        if any([ext in url_dir for ext in lund_extensions]):
            filename = url_dir.split("/")[-1]
            # This returns a tuple, we need the contents of the tuple
            lund_text = html_reader.html_reader(url_dir, '')[0]
            utils.printer2('HTML from lund link is: {0}'.format(lund_text))
            # Not strictly needed, but SQLite can't read " into data fields,
            # only ' characters
            lund_text_db = lund_text.replace('"', "'")
            print("\t Gathered lund file '{0}'".format(url_dir))
            with open(lund_dir + "/" + filename, "a") as file:
                file.write(lund_text_db)
        # Web directory
        else:
            raw_html, lund_urls = html_reader.html_reader(
                url_dir, fs.lund_identifying_text)
            subprocess.call(['mkdir', '-p', lund_dir])
            Lund_Downloader(url_dir, lund_urls, lund_dir)
    # Case 1/2
    else:
        # Single local file
        if any([ext in url_dir for ext in lund_extensions]):
            subprocess.call(['cp', url_dir, lund_dir + '/'])
        # Local directory, many files
        else:
            print('Downloading all files in {}'.format(url_dir))
            lund_files = glob.glob(url_dir + '*')
            print(lund_files)
            for lf in lund_files:
                if any([ext in lf for ext in lund_extensions]):
                    subprocess.call(['cp', lf, lund_dir + '/'])

    return lund_dir
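# Usage sketch mapping inputs to the four cases in the docstring (paths are
# assumptions for illustration):
#
#     Lund_Entry('/work/lund/events.dat', 'lund_dir')        # case 1: cp one file
#     Lund_Entry('/work/lund/', 'lund_dir')                  # case 2: cp matching files
#     Lund_Entry('https://example.com/ev.lund', 'lund_dir')  # case 3: download one file
#     Lund_Entry('https://example.com/lund', 'lund_dir')     # case 4: scrape + download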
def Lund_Entry(lund_location, lund_download_dir="lund_dir/"):
    valid_lund_extensions = ['.dat', '.txt', '.lund']

    # Make sure lund_download_dir ends with a /, and if not, add one.
    # (`!=` rather than `is not`: identity checks on string literals are a bug.)
    if lund_download_dir[-1] != "/":
        lund_download_dir += "/"

    # A case used to work around not downloading for types 1/3
    if lund_location == "no_download":
        print('Not downloading files due to SCard type.')
        return lund_location
    elif os.path.exists(lund_download_dir):
        print('Lund directory already exists, not downloading again.')
        return lund_download_dir

    # Create dir. to download / copy files into
    try:
        subprocess.call(['mkdir', '-p', lund_download_dir])
    except Exception as e:
        print("WARNING: unable to make directory {}".format(lund_download_dir))
        print("The error encountered was: \n {}".format(e))

    ##################################################################
    # Case 3/4 - download single / multiple files from online location
    ##################################################################
    if 'http' in lund_location:
        # Download single web file
        if any([ext in lund_location for ext in valid_lund_extensions]):
            # This gets the name of the lund file, assuming the format
            # is http......./lund_file_name
            lund_filename = lund_location.split("/")[-1]
            # Pass the location, file, and download dir to the downloader
            Lund_Downloader(lund_url_base=lund_location,
                            lund_download_dir=lund_download_dir,
                            lund_filename=lund_filename)
        # Download entire web directory
        else:
            try:
                # Read the given location to find all the lund files
                raw_html, lund_filenames = html_reader.html_reader(
                    lund_location, valid_lund_extensions)
            except Exception as e:
                print("ERROR: unable to download lund files from {}".format(
                    lund_location))
                print("The error encountered was: \n {}".format(e))
                exit()

            if len(lund_filenames) == 0:
                print("No Lund files found (they must end in '{}'). "
                      "Is the online repository correct?".format(
                          valid_lund_extensions))
                exit()

            # Loop through, downloading every LUND file in the directory
            for lund_filename in lund_filenames:
                Lund_Downloader(lund_url_base=lund_location,
                                lund_download_dir=lund_download_dir,
                                lund_filename=lund_filename,
                                single_file=False)
    #######################################################################
    # Case 1/2 - Use rsync to copy files from a JLab location to OSG.
    # rsync option: -rlpgoD replaces -a (-rlptgoD) so mtimes are not
    # preserved: copied files get a new timestamp, which plays nicely
    # with our autodeletion cronjobs.
    #######################################################################
    else:
        # Single local file
        if any([ext in lund_location for ext in valid_lund_extensions]):
            try:
                if lund_location[0] != "/":
                    lund_location = "/" + lund_location
                # Example full filepath:
                # gemc@dtn1902-ib:/lustre19/expphy/volatile/clas12/robertej/testlund.txt
                lund_copy_path = ('gemc@dtn1902-ib:/lustre19/expphy'
                                  + lund_location)
                subprocess.call(
                    ['rsync', '-rlpgoD', lund_copy_path, lund_download_dir])
            except Exception as e:
                print("ERROR: unable to copy lund files from {}".format(
                    lund_location))
                print("The error encountered was: \n {}".format(e))
        # Local directory, many files
        else:
            if lund_location[0] != "/":
                lund_location = "/" + lund_location
            if lund_location[-1] != "/":
                lund_location += "/"
            if "/lustre19/expphy" not in lund_location:
                lund_location = '/lustre19/expphy' + lund_location
            lund_copy_path = 'gemc@dtn1902-ib:' + lund_location
            subprocess.call([
                'rsync', '-zrlpgoDv', '--prune-empty-dirs',
                "--include='*.dat'", "--include='*.txt'", "--exclude='*'",
                lund_copy_path, lund_download_dir
            ])
            # The quoted include/exclude patterns above do not filter as
            # intended when passed through subprocess (the quotes become part
            # of the pattern), so prune non-lund files after the transfer.
            files = os.listdir(lund_download_dir)
            for f in files:
                if not any([ext in f for ext in valid_lund_extensions]):
                    os.remove(lund_download_dir + f)

    return lund_download_dir
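# For reference, the directory branch above behaves roughly like this shell
# command (the host and lustre prefix come from the code; the source directory
# is an illustrative assumption), followed by the extension-based cleanup loop:
#
#     rsync -zrlpgoDv --prune-empty-dirs \
#         gemc@dtn1902-ib:/lustre19/expphy/volatile/clas12/user/lund/ lund_dir/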
def index():
    # `num` and `convert` are assumed to be defined elsewhere in this module
    # (the range of document numbers and a zero-padding helper, respectively).
    papers = [html_reader('cacm/CACM-%s.html' % convert(i)) for i in num]

    # Year index: a paper is keyed by both its date field and metadata year.
    years = set()
    for item in papers:
        years.update((item.metadata[6], item.date[0]))
    index_year = {year: set() for year in years}
    for item in papers:
        index_year[item.date[0]].add(item.id)
        index_year[item.metadata[6]].add(item.id)
    for key in index_year:
        index_year[key] = np.sort(np.array(list(index_year[key])))
    year_file = h5py.File("year_index.h5", 'w')
    for key in index_year:
        year_file.create_dataset(name=key, data=index_year[key])
    year_file.close()

    # Month index, likewise keyed from both the date field and the metadata.
    months = set()
    for item in papers:
        months.update((item.date[1], item.metadata[2]))
    index_month = {month: set() for month in months}
    for item in papers:
        index_month[item.date[1]].add(item.id)
        index_month[item.metadata[2]].add(item.id)
    for key in index_month:
        index_month[key] = np.sort(np.array(list(index_month[key])))
    month_file = h5py.File("month_index.h5", 'w')
    for key in index_month:
        month_file.create_dataset(name=key, data=index_month[key])
    month_file.close()

    # ID index.
    ids = set()
    for item in papers:
        ids.add(item.metadata[0])
    index_id = {paper_id: set() for paper_id in ids}
    for item in papers:
        index_id[item.metadata[0]].add(item.id)
    for key in index_id:
        index_id[key] = np.sort(np.array(list(index_id[key])))
    id_file = h5py.File("id_index.h5", 'w')
    for key in index_id:
        id_file.create_dataset(name=key, data=index_id[key])
    id_file.close()

    # SIG index.
    sigs = set()
    for item in papers:
        sigs.add(item.metadata[1])
    index_sig = {sig: set() for sig in sigs}
    for item in papers:
        index_sig[item.metadata[1]].add(item.id)
    for key in index_sig:
        index_sig[key] = np.sort(np.array(list(index_sig[key])))
    sig_file = h5py.File("sig_index.h5", "w")
    for key in index_sig:
        sig_file.create_dataset(name=key, data=index_sig[key])
    sig_file.close()

    # Time index, keyed on metadata fields 4 and 5 joined together.
    times = set()
    for item in papers:
        times.add(item.metadata[4] + " " + item.metadata[5])
    index_time = {time: set() for time in times}
    for item in papers:
        index_time[item.metadata[4] + " " + item.metadata[5]].add(item.id)
    for key in index_time:
        index_time[key] = np.sort(np.array(list(index_time[key])))
    time_file = h5py.File("time_index.h5", "w")
    for key in index_time:
        time_file.create_dataset(name=key, data=index_time[key])
    time_file.close()

    # Author index: each paper may contribute several authors.
    authors = set()
    for item in papers:
        authors.update(item.author)
    index_author = {author: set() for author in authors}
    for item in papers:
        for a in item.author:
            index_author[a].add(item.id)
    for key in index_author:
        index_author[key] = np.sort(np.array(list(index_author[key])))
    author_file = h5py.File("author_index.h5", "w")
    for key in index_author:
        author_file.create_dataset(name=key, data=index_author[key])
    author_file.close()

    # Date index, keyed on metadata fields 2 and 3 joined together.
    dates = set()
    for item in papers:
        dates.add(item.metadata[2] + " " + item.metadata[3])
    index_date = {date: set() for date in dates}
    for item in papers:
        index_date[item.metadata[2] + " " + item.metadata[3]].add(item.id)
    for key in index_date:
        index_date[key] = np.sort(np.array(list(index_date[key])))
    date_file = h5py.File("date_index.h5", "w")
    for key in index_date:
        date_file.create_dataset(name=key, data=index_date[key])
    date_file.close()  # was author_file.close(), which left this file open

    # Word index over each paper's corpus (the inverted index proper).
    words = set()
    for item in papers:
        words.update(item.corpus)
    index_word = {word: set() for word in words}
    for item in papers:
        for word in item.corpus:
            index_word[word].add(item.id)
    for key in index_word:
        index_word[key] = np.sort(np.array(list(index_word[key])))
    word_file = h5py.File("word_index.h5", "w")
    for key in index_word:
        word_file.create_dataset(name=key, data=index_word[key])
    word_file.close()
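# Usage sketch: reading a posting list back from one of the generated HDF5
# indices. The key 'algorithm' is an illustrative assumption; real keys are
# whatever years, authors, or corpus words occurred in the indexed papers.
#
#     import h5py
#     with h5py.File("word_index.h5", "r") as f:
#         paper_ids = f["algorithm"][:]  # sorted array of matching paper ids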