Example #1
import argparse
import json

# Assumed import: the Indexer class is expected to come from a local module.
from indexer import Indexer


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("index_file", help="Path to file containing index definition")
    parser.add_argument("data_file", help="Path to file containing json dump of documents to index")
    parser.add_argument("es_host", help="Elasticsearch Host to index against")
    parser.add_argument("es_port", help="Elasticsearch Host Transport Port")
    parser.add_argument("-r", "--replace", action="store_true", help="Overwrite index if it already exists")

    args = parser.parse_args()

    index_config_path = args.index_file
    documents_path = args.data_file
    host = args.es_host
    port = args.es_port

    overwrite = args.replace

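    # Load the index definition (settings, mappings, doctypes) from the JSON config file.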
    with open(index_config_path) as data_file:
        index_config = json.load(data_file)

    index_settings = index_config['index_settings']
    index_name = index_settings['indexname']
    doctypes = index_settings['doctypes']

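    # Create the index, then add each doctype's mappings and index its documents.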
    indexer = Indexer(host, port)
    indexer.create_index(index_name, index_settings['settings'], overwrite=overwrite)

    for doctype in doctypes:
        indexer.add_mappings(index_name, doctype, index_settings['mappings'][doctype])
        indexer.index_documents(documents_path, doctype, index_name)
Example #2
# Assumed import: Indexer provides the class-level index helpers used below.
from indexer import Indexer


def command_index(directory):
    """
        Indexes data that from given directory again 
    """
    global dictionary
    global index
    Indexer.remove_index()
    # Set default data directory
    if directory is None:
        directory = 'reuters21578'
    print('Indexing ' + directory + ' folder...')
    Indexer.create_index(directory=directory)
    dictionary, index = Indexer.get_index()
    print('Index created')
Example #3
import os
from argparse import ArgumentParser

# Assumed import: the Indexer class is expected to come from a local module.
from indexer import Indexer


def main():
    parser = ArgumentParser()
    parser.add_argument("-p",
                        "--path",
                        dest="path",
                        default=None,
                        help="Document path")
    parser.add_argument("-t",
                        "--threads",
                        dest="threads",
                        help="Number of threads to launch")
    args = parser.parse_args()
    path = os.path.abspath(args.path) if args.path else "docs"
    threads = int(args.threads) if args.threads else 5
    indexer = Indexer(path, threads)
    indexer.create_index()
Example #4
from indexer import Indexer

# Instantiate Indexer with a path

index = Indexer("../TMP/")

# Create Index

# index.create_index()

# Create Index with filters

index.create_index(duplicates=True, max_size="315 KB")

# Filter By Duplicates

# duplicates = index.filter_duplicates()

# Write to file

index.write_to_file()
Example #5
# Python 2 snippet: urllib2, robotparser and dict.iteritems() are Python 2 APIs.
import re
import robotparser
import urllib2
import urlparse
from collections import Counter

# Assumed imports: BeautifulSoup from bs4 and the Indexer class from a local module.
from bs4 import BeautifulSoup
from indexer import Indexer


class Crawler(object):
    def __init__(self):
        self.visited_url = set()
        self.root_url = None
        self.indexer = Indexer()

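    # Check whether the site's robots.txt allows fetching the given URL.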
    def pass_robot_txt(self, url):
        robot = robotparser.RobotFileParser()
        # Point the parser at the site's robots.txt file rather than the root page itself.
        robot.set_url(urlparse.urljoin(self.root_url, '/robots.txt'))
        robot.read()

        return robot.can_fetch('*', url)

    def define_root_url(self, url):
        self.root_url = url

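    # Collect internal links (relative hrefs) from the page and return them as absolute URLs.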
    def add_included_suburls(self, soup):

        urls = set()

        refs = soup.findAll('a')

        for ref in refs:
            try:
                href = ref['href']
            except KeyError:
                print("Link has no href attribute")
                continue

            if len(href) < 2:
                continue

            if '//' in href:
                continue

            if href[0] != '/':
                continue

            if self.root_url in href:
                urls.add(href)

            urls.add(self.root_url + href)

        return urls

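    # Extract the words from the page's visible text, register them with the indexer,
    # and return their counts.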
    def get_pair_word_and_count(self, soup):
        def visible(element):
            if element.parent.name in [
                    'head', 'script', 'style', '[document]'
            ]:
                return False

            if re.match('<!--.*-->', str(element)):
                return False

            if element == '\n':
                return False

            return True

        data = soup.findAll(text=True)

        visible_text = filter(visible, data)
        words = list()

        for text in visible_text:
            result = re.findall(r'[0-9a-z]+', text.lower())

            for res in result:
                words.append(res)

        self.indexer.add_words(set(words))

        return Counter(words)

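    # Recursively crawl from url, following at most width links per page down to depth levels.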
    def visit(self, url, width, depth):

        if depth < 0:
            return

        if not self.pass_robot_txt(url):
            raise Exception("Blocked by robots.txt")

        current_url = url
        self.indexer.add_url(current_url)

        depth = depth - 1

        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            print("Can't open this *** url")
            return

        soup = BeautifulSoup(html, 'html.parser')

        urls = self.add_included_suburls(soup)

        for url in urls:
            if url in self.visited_url:
                continue

            if width == 0:
                break

            self.visited_url.add(url)
            width = width - 1
            self.visit(url, width, depth)

        words = self.get_pair_word_and_count(soup).iteritems()

        self.indexer.create_index(words, current_url)

    def run(self, url, width, depth):
        self.define_root_url(url)
        self.visit(url, width, depth)
Example #6
from indexer import Indexer
from finder import Finder

indexer = Indexer()
indexer.create_index()
# print("Index:\n" + str(indexer.get_index()))
print("Indexing done...")

finder = Finder(indexer.get_index())

query = input("Enter the search keyword:")
finder.search(query)