def create_index(files): """ Given a list of fully-qualified filenames, build an index from word to set of document IDs. A document ID is just the index into the files parameter (indexed from 0) to get the file name. Make sure that you are mapping a word to a set of doc IDs, not a list. For each word w in file i, add i to the set of document IDs containing w Return a dict object mapping a word to a set of doc IDs. """ dct_index = defaultdict() # Create an empty dict for idx, fname in enumerate( files): # Iterate through every given file names s_content = get_text( fname) # Turn each file name into a string content lst_word = words( s_content ) # Turn the string content into a list of normalized words for word in lst_word: # For each normalized words, update the dict by word-file as key-value pairs if word not in dct_index: dct_index[word] = {idx} # If the key doesn't exist, create one else: dct_index[word].add( idx ) # If the key exist, add the file name into the set of the file names under that word return dct_index
def create_index(files): """ Given a list of fully-qualified filenames, build an index from word to set of document IDs. A document ID is just the index into the files parameter (indexed from 0) to get the file name. Make sure that you are mapping a word to a set of doc IDs, not a list. For each word w in file i, add i to the set of document IDs containing w Return a dict object mapping a word to a set of doc IDs. """ document_ID = {} index = {} for i in range(0, len(files)): document_ID[files[i]] = i terms = [] for file in files: terms = get_text(file) terms = words(terms) for term in terms: if index.__contains__(term) == True: index[term].add(document_ID[file]) else: index[term] = {document_ID[file]} return index
def create_index_old(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    # create massive list of all words from ALL FILES
    allWords = []
    for file in files:
        allWords.append(get_text(file))   # extract file contents as massive strings
    wordsInAllDocuments = [toUnique(words(f)) for f in allWords]   # unique words per document (used later)
    allWords = words(" ".join(allWords))  # convert strings into one list of words
    allWords = toUnique(allWords)         # make it unique (i.e. no duplicate words)
    # iterate through words and generate index
    dictionary = {w: set() for w in allWords}
    for word in allWords:                 # loop through all unique words
        for i, wordsInOneDocument in enumerate(wordsInAllDocuments):   # loop through all files
            if word in wordsInOneDocument:
                dictionary[word].add(i)   # doc IDs are indexed from 0, so add i, not i + 1
    return dictionary
def check_service(host: str) -> Verdict:
    try:
        with build_session() as session:
            api = Api(host, session)
            resp = api.register_user(Randomizer.user())
            if resp.status_code != 201:
                return Verdict.MUMBLE("Can't register user", "Can't register user")
            file_in_zip, *file = create_zip()
            resp = api.upload_zip(file)
            if resp.status_code != 202:
                return Verdict.MUMBLE("Can't upload file", "Can't upload file")
            resp = api.search_file(file_in_zip)
            if file_in_zip not in resp.text:
                return Verdict.MUMBLE("Can't find file from zip", "Can't find file from zip")
            resp = api.create_note(get_text(), True)
            if resp.status_code != 201:
                return Verdict.MUMBLE("Can't create note", "Can't create note")
            return Verdict.OK()
    except Exception as e:
        return Verdict.DOWN("Can't connect to service", str(e))
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes.
    This does the exact same thing as create_index() except that it uses
    your htable. For the number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    NBUCKETS = 4011
    table = htable(NBUCKETS)                  # Create an empty hashtable
    for idx, fname in enumerate(files):       # Iterate through every given file name
        s_content = get_text(fname)           # Turn each file name into its string content
        lst_word = words(s_content)           # Turn the string content into a list of normalized words
        for word in lst_word:                 # For each normalized word, update the table
            set_IDs = htable_get(table, word)
            if set_IDs is None:
                htable_put(table, word, {idx})   # first occurrence: create a new doc-ID set
            else:
                set_IDs.add(idx)                 # otherwise mutate the stored set in place
    return table
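# Hedged usage sketch for the htable-backed index (not part of the assignment
# code): htable_get() is assumed to be the project's helper that returns the
# stored value for a key, or None when the key is absent; "the" is just an
# example query word.
def demo_myhtable_index(files):
    table = myhtable_create_index(files)
    doc_IDs = htable_get(table, "the")        # e.g. {0, 3, 7}, or None if absent
    print(doc_IDs if doc_IDs is not None else "word not in corpus")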
def linear_search(files, terms):
    returnFiles = []
    searchTerms = set(terms)
    for item in files:
        fileWords = set(words(get_text(item)))
        if searchTerms <= fileWords:   # subset, not strict subset: all terms present qualifies
            returnFiles.append(item)
    return returnFiles
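# Hedged usage sketch for linear_search (not part of the assignment code):
# the query terms are made up, and they must already be normalized the same
# way words() normalizes file contents for the match to succeed.
def demo_linear_search(files):
    hits = linear_search(files, ["ronald", "reagan"])   # hypothetical query
    for f in hits:
        print(f)    # each printed file contains both query terms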
def myhtable_create_index(files):
    wordBook = htable(4011)
    for fileIndex, item in enumerate(files):
        fileWords = set(words(get_text(item)))
        for word in fileWords:
            docIDs = htable_get(wordBook, word)
            if docIDs is None:
                htable_put(wordBook, word, {fileIndex})   # store a set of doc IDs, not a bare int
            else:
                docIDs.add(fileIndex)                     # accumulate IDs instead of overwriting
    return wordBook
def create_index(files):
    d = {}
    for k, file in enumerate(files):      # loop through files
        wordsInDoc = words(get_text(file))
        for word in wordsInDoc:           # loop through words in that file
            if word not in d:
                d[word] = {k}             # map word to a set of doc IDs (indexes), not file names
            else:
                d[word].add(k)
    return d
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents have all words in terms as normalized by your words()
    function. Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    result = []
    search_terms = set(terms)             # plain set ops; no need for a pandas Series here
    for file in files:
        contents = words(get_text(file))
        if search_terms.issubset(contents):   # every term appears in this file
            result.append(file)
    return result
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents have all words in terms as normalized by your words()
    function. Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    final_list = []
    set_terms = set(terms)
    for article in files:                 # loop variable is "article", singular
        data = get_text(article)
        new_data = set(words(data))
        if set_terms.issubset(new_data):
            final_list.append(article)
    return final_list
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes.
    This does the exact same thing as create_index() except that it uses
    your htable. For the number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    nbuckets = 4011
    table = htable(nbuckets)
    for value in range(len(files)):
        terms = words(get_text(files[value]))
        for key in terms:
            doc_IDs = htable_get(table, key)
            if doc_IDs is None:
                htable_put(table, key, {value})   # htable_put mutates table; don't reassign its return value
            else:
                doc_IDs.add(value)                # merge this doc ID into the existing set
    return table
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents have all words in terms as normalized by your words()
    function. Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    lst_qualified = []
    for file in files:
        # The file qualifies when every search term appears among its normalized words
        if set(terms).issubset(words(get_text(file))):
            lst_qualified.append(file)
    return lst_qualified
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents have all words in terms as normalized by your words()
    function. Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    listOfFiles = []
    for file in files:
        # convert to list of words
        allWordsInFile = words(get_text(file))
        # check to see if the search terms are a subset of the file words
        if set(terms).issubset(allWordsInFile):
            listOfFiles.append(file)
    return listOfFiles
from collections import defaultdict

def create_index(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    if len(files) <= 0:
        return None
    index = defaultdict(set)
    for i in range(len(files)):
        file_content = get_text(files[i])
        key_words = words(file_content)
        for word in key_words:
            index[word].add(i)
    return index
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes.
    This does the exact same thing as create_index() except that it uses
    your htable. For the number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    d = htable(4011)                      # initialize empty htable
    for k, file in enumerate(files):      # loop through files
        wordsInDoc = words(get_text(file))
        for word in wordsInDoc:           # loop through words in that file
            docIDs = htable_get(d, word)
            if docIDs is None:
                htable_put(d, word, {k})  # map word to a set of doc indexes, not file names
            else:
                docIDs.add(k)             # accumulate indexes instead of overwriting
    return d
from collections import defaultdict

def create_index(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs. A document ID is just the index into the
    files parameter (indexed from 0) to get the file name. Make sure that
    you are mapping a word to a set of doc IDs, not a list.
    For each word w in file i, add i to the set of document IDs containing w
    Return a dict object mapping a word to a set of doc IDs.
    """
    wordlist = [words(get_text(files[i])) for i in range(len(files))]
    combinelist = defaultdict(set)
    for i in range(len(files)):
        d = dict.fromkeys(wordlist[i], i)   # unique words of doc i, each mapped to doc ID i
        for key, value in d.items():
            combinelist[key].add(value)
    return combinelist
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes.
    This does the exact same thing as create_index() except that it uses
    your htable. For the number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    wordlist = [words(get_text(files[i])) for i in range(len(files))]
    table = htable(4011)
    # First pass: map every word to an empty doc-ID set
    for i in range(len(files)):
        for j in range(len(wordlist[i])):
            htable_put(table, wordlist[i][j], set())
    # Second pass: mutate each word's stored set in place with the doc IDs
    for i in range(len(files)):
        for j in range(len(wordlist[i])):
            htable_get(table, wordlist[i][j]).add(i)
    return table
def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes.
    This does the exact same thing as create_index() except that it uses
    your htable. For the number of htable buckets, use 4011.
    Returns a list-of-buckets hashtable representation.
    """
    if len(files) <= 0:
        return None
    table = htable(4011)
    for i in range(len(files)):
        file_content = get_text(files[i])
        key_words = words(file_content)
        for word in key_words:
            # Because the value is a set, this htable_put implementation
            # merges the new value into the existing set whenever the key
            # is already in the hash table.
            htable_put(table, word, {i})
    return table
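# Design note on the htable-backed solutions above: they rely on two different
# update contracts. This one assumes htable_put() merges a new set value into
# any existing set for the key; the get-then-mutate versions instead fetch the
# stored set with htable_get() and call .add() on it, which requires
# htable_get() to return a reference to the stored set rather than a copy.
# Either contract works, but they are not interchangeable across htable
# implementations.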
def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them whose
    file contents have all words in terms as normalized by your words()
    function. Parameter terms is a list of strings.
    Perform a linear search, looking at each file one after the other.
    """
    if files is None or terms is None or len(files) == 0 or len(terms) == 0:
        return None
    ret_docs = []
    for file in files:
        words_in_file = words(get_text(file))
        term_missing = False
        for term in terms:
            if term not in words_in_file:
                # any missing term disqualifies this file; stop early
                term_missing = True
                break
        if not term_missing:
            ret_docs.append(file)
    return ret_docs