def read_data():
    _vocab = load_list(FLAGS.data_dir, '.vocab')
    _labels = load_list(FLAGS.data_dir, '.labels')
    if FLAGS.format == 'pkl':
        with open(FLAGS.input, 'rb') as f:
            data = pickle.load(f)
    elif FLAGS.format == 'iob':
        # First pass: find the longest sentence and check it fits in num_steps.
        with open(FLAGS.input) as f:
            length = 0
            _length = 0
            for line in f:
                if line == '\n' and length > 0:
                    _length = max(_length, length)
                    length = 0
                else:
                    length += 1
        if _length > FLAGS.num_steps:
            raise ValueError('Max sequence length %i > num_steps %i' % (_length, FLAGS.num_steps))
        else:
            _length = FLAGS.num_steps
        # Second pass: build padded token/label/length/weight lists per sentence.
        with open(FLAGS.input) as f:
            data = {'tokens': [], 'labels': [], 'lengths': [], 'weights': []}
            tokens = []
            labels = []
            for line in f:
                if line == '\n' and tokens and labels:
                    length = len(tokens)
                    weights = [1] * len(tokens) + [0] * (_length - length)
                    tokens += [0] * (_length - length)
                    labels += [0] * (_length - length)
                    data['tokens'].append(tokens)
                    data['labels'].append(labels)
                    data['lengths'].append(length)
                    data['weights'].append(weights)
                    tokens = []
                    labels = []
                else:
                    token, label = line.split()
                    if token in _vocab:
                        tokens.append(_vocab.index(token))
                    else:
                        tokens.append(1)  # unknown words map to index 1
                    labels.append(_labels.index(label))
    elif FLAGS.format == 'txt':
        raise ValueError('Format not supported yet.')
    else:
        raise ValueError('Unknown file format %s.' % FLAGS.format)
    return _vocab, _labels, data
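# A minimal, self-contained sketch (not part of the original code) of the
# per-sentence padding read_data applies in 'iob' mode: tokens and labels are
# zero-padded to num_steps and weights mark the real positions. The helper
# name and example ids below are illustrative assumptions.
def _pad_example(token_ids, label_ids, num_steps):
    length = len(token_ids)
    weights = [1] * length + [0] * (num_steps - length)
    tokens = token_ids + [0] * (num_steps - length)
    labels = label_ids + [0] * (num_steps - length)
    return tokens, labels, length, weights

# _pad_example([4, 7, 9], [2, 0, 1], 5)
# -> ([4, 7, 9, 0, 0], [2, 0, 1, 0, 0], 3, [1, 1, 1, 0, 0])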
def read_part_of_posting(self, posting, num_of_file, last_file=False, first_read=False):
    """Gets a posting file name and its index, and reads part of its content
    from disk. The current offset into the posting file is stored so the next
    call can resume where this one stopped."""
    num_of_file += 1  # this gives values of 1..* to file names, skipping 0
    with open(posting, 'rb') as pickle_in:
        # resume from the offset saved by the previous read of this file
        if num_of_file in self.file_descriptor_dict:
            fdr = self.file_descriptor_dict[num_of_file]
            pickle_in.seek(fdr)
        part_of_posting = []
        if int(Indexer.NUM_OF_TERMS_IN_POSTINGS / Indexer.PICKLE_COUNTER) > 0:
            amount_to_read = int(Indexer.NUM_OF_TERMS_IN_POSTINGS / Indexer.PICKLE_COUNTER)
        else:
            amount_to_read = Indexer.NUM_OF_TERMS_IN_POSTINGS
        if last_file:
            amount_to_read = Indexer.NUM_OF_TERMS_IN_POSTINGS
        if first_read:
            for i in range(amount_to_read):
                try:
                    key_value = utils.load_list(pickle_in)
                    part_of_posting.append(key_value)
                except Exception:
                    break
        else:
            for i in range(amount_to_read):
                try:
                    key_value = utils.load_list(pickle_in)
                    part_of_posting.append(key_value)
                    self.values_size += len(key_value[1])
                    if self.values_size >= 2000000:
                        self.values_size = 0
                        break
                except Exception:
                    break
        # remember where to resume the next read of this posting file
        self.file_descriptor_dict[num_of_file] = pickle_in.tell()
    return part_of_posting
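# A minimal, self-contained sketch (with made-up names, not the class's API) of
# the seek/tell pattern used above: remember the byte offset after each partial
# read so a later call can resume reading pickled records where the previous
# one stopped.
import pickle

def read_next_records(path, offsets, n):
    """Read up to n pickled records from `path`, resuming at offsets[path]."""
    records = []
    with open(path, 'rb') as f:
        f.seek(offsets.get(path, 0))
        for _ in range(n):
            try:
                records.append(pickle.load(f))
            except EOFError:
                break
        offsets[path] = f.tell()  # remember where to resume next time
    return records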
def read_clust_output(fname, max_pval):
    lines = ut.load_list(fname)[1:]
    cxs = [set(line.split(',')[7].strip('"').split()) for line in lines]
    pvals = [line.split(',')[6] for line in lines]
    details = [','.join(line.split(',')[:7]) for line in lines]
    print "Retaining complexes with p < %0.2f." % max_pval
    cxs, pvals, details = keep_pvals(cxs, pvals, details, max_pval)
    return cxs, pvals, details
def load_prots_from_fasta(fname):
    """
    Files are in ut.config()['fastadir'].
    Returns a set since usually I'm searching against it.
    """
    protlines = [l for l in ut.load_list(fname) if l[0] == '>']
    genes = set([l.split(' ')[0].strip('>') for l in protlines])
    return genes
def load_prots_from_fasta_dep(fname):
    """
    Files are in ut.config()['fastadir'].
    All so far can be split by both space and |.
    Returns a set since usually I'm searching against it.
    """
    protlines = [l[1:] for l in ut.load_list(fname) if l[0] == '>']
    prots = set([l.split(' ')[0].split('|')[0] for l in protlines])
    return prots
def _load_prots_to_lol(fname):
    prots = ut.load_list(fname)
    prots_clean = []
    for line in prots:
        if line[0] == '>':
            # a header line starts a new record
            prots_clean.append([line])
        else:
            prots_clean[-1].append(line)
    return prots_clean
def load_index(self, fn):
    """
    Loads a pre-computed index (or indices) so we can answer queries.
    Input:
        fn - file name of pickled index.
    """
    listObj = utils.load_list(fn)
    self.inverted_idx = listObj[0]
    self.postingDict = listObj[1]
    self.documents = listObj[2]
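# A hedged sketch (not from the original code) of the on-disk layout
# load_index expects: a pickled three-element list. Only the ordering
# [inverted_idx, postingDict, documents] is taken from the method above;
# the value shapes shown here are illustrative assumptions.
index_parts = [
    {'hello': [12, '3']},             # inverted_idx: term -> stats (shape assumed)
    {'hello': [(0, 2), (7, 1)]},      # postingDict: term -> postings (shape assumed)
    {0: 'doc text', 7: 'other doc'},  # documents: doc id -> document (shape assumed)
]
inverted_idx, postingDict, documents = index_parts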
def multi_identities(input_fname, out_dir):
    input_list = ut.load_lol(input_fname)
    for desc, prots_fname, source_fasta, odict, target in input_list:
        print "%s, proteins: %s\n source: %s\n odict: %s\ntarget: %s" % (
                desc, prots_fname, source_fasta, odict, target)
        prots = ut.load_list(prots_fname)
        sims = all_identities(prots, odict, source_fasta, target)
        out_fname = os.path.join(out_dir,
                ut.shortname(target).split('.')[0] + "_" + desc + ".txt")
        ut.write_tab_file(sims, out_fname, islist=True)
def figures(recon_fname, exclude_recon_fname, kegg_fname, all_ppis_fname,
            recon_pairs=None, do_plot_cdf=False, return_pairs=False):
    rpairs = recon_pairs or load_seq_pairs(recon_fname,
            ut.load_list(exclude_recon_fname))
    kpairs = load_kegg_sequentials(kegg_fname)
    pdk = pd.PairDict(kpairs)
    intpairs = [p for p in rpairs if pdk.contains(p)]
    ppis = pu.load_ppis(all_ppis_fname)
    plot_pairs_randoms_etc(intpairs, ppis)
    if do_plot_cdf:
        plot_cdf_pos_randoms(intpairs, ppis)
    if return_pairs:
        return intpairs
def prots2genes(fname):
    """
    If there's only one item in the first line, just return a dummy dict
    mapping each id to itself. Otherwise, assume the line begins with >GENEID
    and ends with protein:PROTEINID.
    """
    lines = [l for l in ut.load_list(fname) if len(l) > 0 and l[0] == '>']
    if len(lines[0].split()) == 1:
        return dict([(g, g) for g in [l.strip('>') for l in lines]])
    elif len(lines[0].split(':')) == 1:
        # Xl
        return dict([(g, g) for g in [l.split()[0].strip('>') for l in lines]])
    else:
        return dict([(p.split()[-1].split(':')[1], p.split()[0].strip('>'))
                for p in lines])
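# A small, self-contained illustration (with made-up identifiers) of the three
# header shapes prots2genes handles; the hypothetical helper below mirrors its
# per-line parsing without reading a file and returns a (protein_id, gene_id) pair.
def _header_to_pair(header):
    parts = header.split()
    if len(parts) == 1:
        gene = header.strip('>')
        return gene, gene
    if len(header.split(':')) == 1:  # no "protein:..." field (the Xl case)
        gene = parts[0].strip('>')
        return gene, gene
    return parts[-1].split(':')[1], parts[0].strip('>')

# _header_to_pair('>GENE1 some description protein:PROT1') -> ('PROT1', 'GENE1')
# _header_to_pair('>GENE2') -> ('GENE2', 'GENE2')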
def merge_chunks(self):
    """
    Performs a K-way merge on the posting files (N disk accesses) and
    writes new posting files to the disk.
    :return:
    """
    saved_chunks = []
    chunks_indices = np.zeros(shape=(len(self.locations_at_postings)), dtype=np.int32)
    chunk_length = self.postingDict_size // len(self.locations_at_postings) + 1
    # load the first chunk of every posting file into the chunked list
    for key in self.locations_at_postings:
        loaded, offset = utils.load_list(key, self.config.get_out_path(),
                                         self.locations_at_postings[key], chunk_length)
        saved_chunks.append(loaded)
        self.locations_at_postings[key] = offset

    building_list = []
    all_empty = True
    # loop until every posting file has been fully consumed
    while all_empty:
        should_enter = -1
        # loop until one of the in-memory chunks is exhausted
        while should_enter == -1:
            term_to_enter = self.find_term(saved_chunks, chunks_indices)
            tuples_to_merge = []
            indexes_of_the_indexes_to_increase = []
            # find all tuples to merge and the chunk indices to advance
            for idx, term_idx_in_chunk in enumerate(chunks_indices):
                if term_idx_in_chunk < len(saved_chunks[idx]) and \
                        saved_chunks[idx][term_idx_in_chunk][0] == term_to_enter:
                    tuples_to_merge.append(saved_chunks[idx][term_idx_in_chunk])
                    indexes_of_the_indexes_to_increase.append(idx)
            merged_tuple = self.merge_terms_into_one(tuples_to_merge)
            appended_term = merged_tuple[0]
            should_append = True
            # if it is a named entity that appears in fewer than 2 tweets, drop the term
            if appended_term in self.entities_dict and self.entities_dict[appended_term] < 2:
                should_append = False
                self.inverted_idx.pop(appended_term, None)
            # update terms with capital letters
            if appended_term in self.global_capitals and self.global_capitals[appended_term]:
                merged_tuple = (appended_term.upper(), merged_tuple[1])
                inverted_val = self.inverted_idx[appended_term]
                self.inverted_idx.pop(appended_term, None)
                self.inverted_idx[appended_term.upper()] = inverted_val
                appended_term = merged_tuple[0]
            if appended_term in self.inverted_idx and self.inverted_idx[appended_term][0] == 1:
                should_append = False
                self.inverted_idx.pop(appended_term, None)
            if should_append:
                self.accumulative_size += len(merged_tuple[1])
                building_list.append(merged_tuple)
                self.inverted_idx[merged_tuple[0]][1] = str(self.counter_of_postings)
            # advance the indices of the chunks whose tuples were merged into the new posting
            for idx in indexes_of_the_indexes_to_increase:
                chunks_indices[idx] += 1
            should_enter = self.update_should_enter(saved_chunks, chunks_indices)
            # save as soon as the accumulated size reaches the max size of a final posting
            if self.accumulative_size >= self.max_accumulative:
                self.merged_dicts.append(str(self.counter_of_postings))
                utils.save_list(building_list, str(self.counter_of_postings),
                                self.config.get_out_path())
                self.accumulative_size = 0
                self.counter_of_postings += 1
                building_list = []
        # load new chunks into saved_chunks at the indices that ran out
        for index in should_enter:
            loaded, offset = utils.load_list(str(index), self.config.get_out_path(),
                                             self.locations_at_postings[str(index)],
                                             chunk_length)
            saved_chunks[index] = loaded
            chunks_indices[index] = 0
            self.locations_at_postings[str(index)] = offset
        # continue looping while any chunk still has data
        all_empty = False
        for chunk in saved_chunks:
            if len(chunk) > 0:
                all_empty = True
                break
    # save the last posting file
    if len(building_list) > 0:
        self.merged_dicts.append(str(self.counter_of_postings))
        utils.save_list(building_list, str(self.counter_of_postings),
                        self.config.get_out_path())
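# A generic, self-contained sketch of the K-way merge idea used above, written
# with heapq over in-memory sorted lists instead of the class's chunked posting
# files; it skips the entity/capitalization filtering, and all names and data
# here are illustrative, not from the original code.
import heapq
from itertools import groupby

def k_way_merge(sorted_posting_lists):
    """Merge sorted (term, postings) lists, concatenating postings per term."""
    merged = []
    stream = heapq.merge(*sorted_posting_lists, key=lambda pair: pair[0])
    for term, group in groupby(stream, key=lambda pair: pair[0]):
        postings = []
        for _, p in group:
            postings.extend(p)
        merged.append((term, postings))
    return merged

# k_way_merge([[('a', [1]), ('c', [3])], [('a', [2]), ('b', [5])]])
# -> [('a', [1, 2]), ('b', [5]), ('c', [3])]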
import os, re

import numpy as np  # needed for np.zeros below

from utils import load_list, write_list, printProgressBar

sites = ['gamepedia', 'LeagueFandom', 'mobafire']

# get the list of all full text files for all sites
files = list()
for site in sites:
    folder = os.path.join("data", site + "Data")
    filelist = os.listdir(folder)
    files = files + [os.path.join(folder, b) for b in filelist if "fullText" in b]

# get the words to count
terms = load_list("data/filteredTerms.txt")
urls = list()
counts = np.zeros([len(files), len(terms)])
countFilename = "data/counts.txt"
removeText = load_list('removeText.txt')

# Write the first line of the counts file, which is the terms header
with open(countFilename, 'w+', encoding='utf-8', errors='ignore') as f:
    f.write("site\t")
    for t in terms:
        f.write(t + "\t")
    f.write("\n")

# main loop
for idx, file in enumerate(files):
edge.target]["community"]
    if is_delete:
        r = random.random()
        w = edge["weight"]
        if f(w) < r:
            is_delete = False
    return is_delete


if __name__ == "__main__":
    start_time = time.time()
    path = "./Dataset/category_input"
    selected_categories = utils.load_list(path)
    print("Selected {} categories : {}".format(len(selected_categories), selected_categories))

    tags = []
    map_pair_tag_occ = {}
    for i, cat in enumerate(selected_categories):
        print("{}/{} ...".format(i + 1, len(selected_categories)))
        path = "./Dataset/refined_data/Pair_Tag/{}.csv".format(cat)
        df = utils.load_csv(path)
        for _, row in df.iterrows():
            tag1, tag2, occ = "_{}".format(row["Tag1"]), "_{}".format(row["Tag2"]), int(row["Num_Occurrence"])
            tags.extend([tag1, tag2])
            # accumulate co-occurrence counts per (tag1, tag2) pair
            old_occ = map_pair_tag_occ.get((tag1, tag2), 0)
            map_pair_tag_occ.update({(tag1, tag2): old_occ + occ})
import utils
import re

files = ["data/mobafire_CompletedPagesList.txt"]
pages = utils.load_list(files[0])

terms = set()
for p in pages:
    words = p.split('/')
    for w in words:
        if '-' in w:
            # replace '-' with ' '
            w = w.split('-')
            # remove the last part if it is a number or "guide"
            if w[-1] == 'guide' or w[-1].isnumeric():
                w = w[:-1]
            w = " ".join(w)
        terms.add(w.lower().strip())

mobafireTermsList = list(terms)
mobafireTermsList.sort()

file = "data/LeagueFandom_CompletedPagesList.txt"
pages = utils.load_list(file)

terms = set()
termsToReplace = [('_', ' '), ('%27', '\''), ('(item)', ''), ('%26', '&')]
for p in pages:
    if "Teamfight_Tactics" in p:
        continue
    words = p.split('/')
    for w in words:
        for t in termsToReplace:
            w = w.replace(t[0], t[1])
# making it easier to load and view the words of most interest.
# The number of words in each file can be set via command line (-b or --batch argument), defaulting to 500 words
import numpy as np
import pandas as pd
from utils import load_list, write_list, printProgressBar
import argparse

# command line argument for how to split the words
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--batch', type=int, default=500, action='store')
args = parser.parse_args()

# load terms and urls
print("loading files...")
terms = load_list("data/filteredTerms.txt")
urls = load_list("data/urls.txt")

counts = np.zeros([len(urls) + 1, len(terms)])
with open("data/counts.txt", 'r', encoding='utf-8', errors='ignore') as f:
    idx = 0
    l = f.readline()  # first line is the header of terms
    l = f.readline()  # first data row
    while l != "":
        #clear_output(wait=True)
        printProgressBar(idx, len(urls) - 1, length=25,
                         suffix=urls[idx] + " " * (100 - len(urls[idx])))
        #print(idx,"of",len(urls)-1,urls[idx])
        counts[idx, :] = l.split("\t")[1:-1]
        l = f.readline()
import numpy as np
from matplotlib import pyplot as plt

from utils import load_list

loss = load_list(r'data/loss.txt')
loss = np.array([float(x) for x in loss])
acc = load_list(r'data/acc.txt')
acc = np.array([float(x) for x in acc])

plt.plot(range(len(loss)), loss)
plt.plot(range(len(acc)), acc)
plt.show()
import utils
import nltk

filteredTerms = utils.load_list('data/filteredTerms.txt')
filteredTerms.sort()

dupeList = dict()
for idx, t in enumerate(list(filteredTerms)):
    surroundingWords = filteredTerms[idx-3:idx+3]
    l = [a for a in surroundingWords if t != a and nltk.edit_distance(t, a) <= 1]
    if len(l) > 0:
        dupeList[t] = l

for d in dupeList:
    print(d, dupeList[d])

keys = list(dupeList.keys())
dupeList['attack']

ing = [word for word in filteredTerms if 'ing' in word]
sWords = [word for word in filteredTerms if 's' == word[-1] and '\'' != word[-2]]

nltk.edit_distance('ability', 'abilities')
]

# Go through remove list
removeList = list()
for r in badList:
    print(r)
    resp = requests.get(r)
    strainer = SoupStrainer(class_=['category-page__member'])
    soup = BeautifulSoup(resp.content, 'lxml', parse_only=strainer)
    l = list()
    #soup.find('div', id="top-schedule").decompose()
    for t in soup.find_all("a"):
        if 'href' in t.attrs and "http" not in t['href'] and "#" not in t['href'] and "action=edit" not in t['href']:
            link = t['href']
            # remove '/wiki/' from beginning of link
            link = link.replace("/wiki/", "")
            l.append(link)
    for a in l:
        removeList.append(a)
    time.sleep(1)

removeList = list(set(removeList))

baseList = utils.load_list("data/LeagueFandom.txt")
# Go through every item in removeList and remove it from baseList
for w in removeList:
    baseList = [b for b in baseList if b != w]

utils.write_list(baseList, 'data/LeagueFandom.txt')
# removes other errors in terms
# finally removes duplicates
# author: Zack Wisti

stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
             "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
             "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs",
             "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
             "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
             "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
             "or", "because", "as", "until", "while", "of", "at", "by", "for", "with",
             "about", "against", "between", "into", "through", "during", "before", "after",
             "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
             "under", "again", "further", "then", "once", "here", "there", "when", "where",
             "why", "how", "all", "any", "both", "each", "few", "more", "most", "other",
             "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too",
             "very", "s", "t", "can", "will", "just", "don", "should", "now"]

championList = load_list('championList.txt')
championList = [b.lower() for b in championList]
itemList = load_list('itemList.txt')
itemList = [b.lower() for b in itemList]


def filterTerms(terms, banlist=None):
    terms = [b.lower() for b in terms]
    # get the stem from any thing that is directory format
    bonus = list()
    for b in terms:
        if "/" in b:
            bonus += b.split("/")
    terms += bonus
    terms = [b for b in terms if not "/" in b]
    # replace _ with a space
    terms = [re.sub("_", " ", b) for b in terms]