Python get_textの例、words.get_text Pythonの例

コード例 #1

0

ファイルを表示

ファイル: index_search.py プロジェクト: SeannLin/search_engine

def create_index(files):
    """
    Build an inverted index from word to the set of document IDs whose
    text contains that word.

    A document ID is the position of a file name inside ``files``
    (counted from 0). ``get_text`` and ``words`` are defined elsewhere;
    presumably they read a file's contents and split them into
    normalized words -- confirm against their definitions.

    :param files: sequence of fully-qualified file names.
    :return: a ``defaultdict`` created without a default factory (so a
        missing key still raises ``KeyError``) mapping word -> set of IDs.
    """
    inverted = defaultdict()  # no factory given: behaves like a plain dict
    for doc_id, path in enumerate(files):
        for token in words(get_text(path)):
            # setdefault creates the set the first time a token is seen;
            # the document ID is then added in either case.
            inverted.setdefault(token, set()).add(doc_id)
    return inverted

コード例 #2

0

ファイルを表示

def create_index(files):
    """
    Given a list of fully-qualified filenames, build an index from word
    to set of document IDs.

    A document ID is the position of the file name in ``files`` (indexed
    from 0). For each word w in file i, i is added to the set stored
    under w.

    :param files: sequence of file names to index.
    :return: dict mapping word -> set of document IDs.
    """
    index = {}
    # enumerate supplies the position directly, so a file name that is
    # listed more than once is recorded under each of its positions
    # rather than only the last one.
    for doc_id, file in enumerate(files):
        for term in words(get_text(file)):
            index.setdefault(term, set()).add(doc_id)
    return index

コード例 #3

0

ファイルを表示

ファイル: index_search.py プロジェクト: loulai/search-engine-implementation

def create_index_old(files):
    """
    Build an index from word to the set of documents containing it.

    The vocabulary comes from running ``words`` over the texts of all
    files joined with single spaces; each word is then checked against
    the per-document word collections.

    NOTE(review): the IDs stored are ``position + 1`` (1-based), whereas
    the assignment text this function was written for describes IDs
    indexed from 0 -- confirm which one callers expect.

    :param files: sequence of fully-qualified file names.
    :return: dict mapping word -> set of (position + 1) values.
    """
    # Raw text of every file, in input order.
    texts = [get_text(path) for path in files]

    # Per-document word collections, used for the membership test below.
    per_document = [toUnique(words(text)) for text in texts]

    # Vocabulary across all documents, de-duplicated by toUnique.
    vocabulary = toUnique(words(" ".join(texts)))

    index = {}
    for word in vocabulary:
        index[word] = {
            position + 1
            for position, document_words in enumerate(per_document)
            if word in document_words
        }
    return index

コード例 #4

0

ファイルを表示

def check_service(host: str) -> Verdict:
    """
    Run a register / upload / search / create-note sequence against ``host``.

    Returns ``Verdict.MUMBLE`` at the first step whose response is not the
    expected one, ``Verdict.OK`` when every step passes, and
    ``Verdict.DOWN`` (carrying the exception text) if anything raises.
    """
    try:
        with build_session() as session:
            client = Api(host, session)

            registered = client.register_user(Randomizer.user())
            if registered.status_code != 201:
                failure = "Can't register user"
                return Verdict.MUMBLE(failure, failure)

            # The first element returned by create_zip is the name searched
            # for below; the remaining elements are uploaded as a list.
            file_in_zip, *archive = create_zip()
            uploaded = client.upload_zip(archive)
            if uploaded.status_code != 202:
                failure = "Can't upload file"
                return Verdict.MUMBLE(failure, failure)

            found = client.search_file(file_in_zip)
            if file_in_zip not in found.text:
                failure = "Can't find file from zip"
                return Verdict.MUMBLE(failure, failure)

            note = client.create_note(get_text(), True)
            if note.status_code != 201:
                failure = "Can't create note"
                return Verdict.MUMBLE(failure, failure)

            return Verdict.OK()
    except Exception as exc:
        # Any exception at all is reported as the service being down.
        return Verdict.DOWN("Can't connect to service", str(exc))

コード例 #5

0

ファイルを表示

def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes using the
    project's ``htable`` helpers instead of a dict.

    The table is created with 4011 buckets. A document index is the
    position of the file name in ``files`` (counted from 0).

    :param files: sequence of file names to index.
    :return: the table object returned by ``htable(4011)``, populated
        through ``htable_put``.
    """
    NBUCKETS = 4011
    table = htable(NBUCKETS)
    for idx, fname in enumerate(files):
        for word in words(get_text(fname)):
            doc_ids = htable_get(table, word)
            if doc_ids is None:
                # First time this word is seen: store a fresh set.
                htable_put(table, word, {idx})
            else:
                # Relies on htable_get handing back the stored set itself,
                # so adding to it updates the table in place.
                doc_ids.add(idx)
    return table

コード例 #6

0

ファイルを表示

ファイル: linsearch.py プロジェクト: spencersmith6/hashtable

def linear_search(files, terms):
    """
    Return the entries of ``files`` whose text contains every search term.

    Each file is examined in turn: its text is obtained with ``get_text``,
    split with ``words``, and compared against the set of terms.

    :param files: iterable of file names.
    :param terms: iterable of search terms.
    :return: list of matching file names, in input order.
    """
    wanted = set(terms)
    # ``<=`` (subset) rather than ``<`` (proper subset): a file whose word
    # set is exactly the search terms must still count as a match.
    return [item for item in files if wanted <= set(words(get_text(item)))]

コード例 #7

0

ファイルを表示

ファイル: myhtable_search.py プロジェクト: spencersmith6/hashtable

def myhtable_create_index(files):
    """
    Populate a 4011-bucket ``htable`` with one entry per distinct word
    per file.

    For every file, each distinct word is passed to ``htable_put``
    together with the file's position in ``files`` (counted from 0).
    How repeated puts for the same word are combined is decided by
    ``htable_put``, which is defined elsewhere.

    :param files: iterable of file names.
    :return: the populated table.
    """
    word_book = htable(4011)
    for position, path in enumerate(files):
        distinct_words = set(words(get_text(path)))
        for token in distinct_words:
            htable_put(word_book, token, position)
    return word_book

コード例 #8

0

ファイルを表示

ファイル: index_search.py プロジェクト: loulai/search-engine-implementation

def create_index(files):
    """
    Build a dict mapping each word to the set of file names whose text
    contains it.

    Note that the values are the file names themselves, not their
    positions in ``files``.

    :param files: sequence of file names.
    :return: dict mapping word -> set of file names.
    """
    index = {}
    for path in files:
        for token in words(get_text(path)):
            # Create the set on first sight of the token, then add the file.
            index.setdefault(token, set()).add(path)
    return index

コード例 #9

0

ファイルを表示

ファイル: linear_search.py プロジェクト: afcarl/DataAcquisition

def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by the
    words() function.

    Performs a linear search, looking at each file one after the other.

    :param files: iterable of file names.
    :param terms: list of strings to look for.
    :return: list of file names containing every term, in input order.
    """
    # Wrap the terms once; the Series does not depend on the file being
    # examined, and the ``terms`` parameter is left untouched.
    wanted = pd.Series(terms)
    result = []
    for file in files:
        contents = words(get_text(file))
        # isin yields one boolean per term; every one must be True.
        if all(wanted.isin(contents)):
            result.append(file)
    return result

コード例 #10

0

ファイルを表示

ファイル: linear_search.py プロジェクト: ksyii/search-ksyii

def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by the
    words() function.

    Performs a linear search, looking at each file one after the other.

    :param files: iterable of file names.
    :param terms: list of strings to look for.
    :return: list of file names containing every term, in input order.
    """
    final_list = []
    set_terms = set(terms)
    for article in files:
        # Use the loop variable itself; the body previously referred to an
        # undefined name and raised NameError on the first file.
        new_data = set(words(get_text(article)))
        if set_terms.issubset(new_data):
            final_list.append(article)
    return final_list

コード例 #11

0

ファイルを表示

ファイル: myhtable_search.py プロジェクト: afcarl/DataAcquisition

def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes using the
    project's ``htable`` helpers, with 4011 buckets.

    A document index is the position of the file name in ``files``
    (counted from 0). Every word occurrence is stored via ``htable_put``
    as a one-element set holding that position.

    :param files: sequence of file names.
    :return: whatever the last ``htable_put`` call returned (or the empty
        table if nothing was put).
    """
    bucket_count = 4011
    table = htable(bucket_count)
    for doc_id, path in enumerate(files):
        for key in words(get_text(path)):
            # The table reference is replaced by htable_put's return value
            # on every call -- this assumes htable_put returns the table.
            table = htable_put(table, key, {doc_id})
    return table

コード例 #12

0

ファイルを表示

ファイル: linear_search.py プロジェクト: SeannLin/search_engine

def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by the
    words() function.

    Performs a linear search, looking at each file one after the other.

    :param files: iterable of file names.
    :param terms: list of strings to look for.
    :return: list of file names containing every term, in input order.
    """
    # Build the term set once instead of twice per file.
    wanted = set(terms)
    # A file qualifies when every wanted term appears among its words,
    # i.e. the term set is a subset of the file's word set.
    return [
        file for file in files
        if wanted.issubset(set(words(get_text(file))))
    ]

コード例 #13

0

ファイルを表示

def linear_search(files, terms):
    """
    Scan the given files one by one and collect those whose words include
    every search term.

    :param files: iterable of fully-qualified file names.
    :param terms: list of strings to look for.
    :return: list of the matching file names, in input order.
    """
    matches = []
    for path in files:
        tokens = words(get_text(path))
        # Skip the file unless the terms form a subset of its words.
        if not set(terms).issubset(tokens):
            continue
        matches.append(path)
    return matches

コード例 #14

0

ファイルを表示

def create_index(files):
    """
    Build an inverted index from word to the set of document IDs whose
    text contains that word.

    A document ID is the position of the file name in ``files``
    (counted from 0).

    :param files: sequence of fully-qualified file names.
    :return: ``defaultdict(set)`` mapping word -> set of IDs, or ``None``
        when ``files`` is empty.
    """
    if len(files) == 0:
        # Empty input yields None rather than an empty index.
        return None

    inverted = defaultdict(set)
    for doc_id, path in enumerate(files):
        for token in words(get_text(path)):
            inverted[token].add(doc_id)
    return inverted

コード例 #15

0

ファイルを表示

ファイル: myhtable_search.py プロジェクト: loulai/search-engine-implementation

def myhtable_create_index(files):
    """
    Populate a 4011-bucket ``htable`` from the words of each file.

    Every word occurrence is stored via ``htable_put`` as a one-element
    set holding the file *name* (not its position in ``files``). How
    repeated puts for the same word are combined is decided by
    ``htable_put``, which is defined elsewhere.

    :param files: sequence of file names.
    :return: the populated table.
    """
    table = htable(4011)
    for path in files:
        for token in words(get_text(path)):
            # A fresh single-element set is handed over on every call.
            htable_put(table, token, {path})
    return table

コード例 #16

0

ファイルを表示

def create_index(files):
    """
    Build an inverted index from word to the set of document IDs whose
    text contains that word.

    A document ID is the position of the file name in ``files``
    (counted from 0). All files are read and split first; the index is
    assembled afterwards.

    :param files: sequence of fully-qualified file names.
    :return: ``defaultdict(set)`` mapping word -> set of IDs.
    """
    # Phase 1: word collection for every file, in input order.
    per_file_words = [words(get_text(path)) for path in files]

    # Phase 2: record each file's position under every word it contains.
    inverted = defaultdict(set)
    for doc_id, tokens in enumerate(per_file_words):
        for token in tokens:
            inverted[token].add(doc_id)
    return inverted

コード例 #17

0

ファイルを表示

def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes using the
    project's ``htable`` helpers, with 4011 buckets.

    Works in two passes over the words of all files: first an empty set
    is put for every word occurrence, then each file's position
    (counted from 0) is added to the set fetched for each of its words.

    :param files: sequence of file names.
    :return: the populated table.
    """
    per_file_words = [words(get_text(path)) for path in files]
    table = htable(4011)

    # Pass 1: put an empty set for every word occurrence.
    for tokens in per_file_words:
        for token in tokens:
            htable_put(table, token, set())

    # Pass 2: add the document position to the set fetched for each word.
    for doc_id, tokens in enumerate(per_file_words):
        for token in tokens:
            htable_get(table, token).add(doc_id)

    return table

コード例 #18

0

ファイルを表示

def myhtable_create_index(files):
    """
    Build an index from word to set of document indexes using the
    project's ``htable`` helpers, with 4011 buckets.

    A document index is the position of the file name in ``files``
    (counted from 0).

    :param files: sequence of file names.
    :return: the populated table, or ``None`` when ``files`` is empty.
    """
    if len(files) == 0:
        return None

    table = htable(4011)
    for doc_id, path in enumerate(files):
        for token in words(get_text(path)):
            # Each occurrence is put as a one-element set; presumably
            # htable_put merges it into any set already stored under the
            # same word -- confirm against htable_put.
            htable_put(table, token, {doc_id})
    return table

コード例 #19

0

ファイルを表示

def linear_search(files, terms):
    """
    Given a list of fully-qualified filenames, return a list of them
    whose file contents has all words in terms as normalized by the
    words() function.

    Performs a linear search, looking at each file one after the other.

    :param files: list of file names.
    :param terms: list of strings to look for.
    :return: list of file names containing every term, in input order;
        ``None`` when either argument is ``None`` or empty.
    """
    if files is None or terms is None or len(files) == 0 or len(terms) == 0:
        return None

    ret_docs = []
    for file in files:
        # A set makes each per-term membership test constant time.
        words_in_file = set(words(get_text(file)))
        # all() stops at the first missing term.
        if all(term in words_in_file for term in terms):
            ret_docs.append(file)
    return ret_docs