Example #1
import argparse
import json

# Assumed import: the Indexer class is expected to come from a local module.
from indexer import Indexer


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("index_file", help="Path to file containing index definition")
    parser.add_argument("data_file", help="Path to file containing json dump of documents to index")
    parser.add_argument("es_host", help="Elasticsearch Host to index against")
    parser.add_argument("es_port", help="Elasticsearch Host Transport Port")
    parser.add_argument("-r", "--replace", action="store_true", help="Overwrite index if it already exists")

    args = parser.parse_args()

    index_config_path = args.index_file
    documents_path = args.data_file
    host = args.es_host
    port = args.es_port

    overwrite = args.replace

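    # Load the index definition (settings, mappings, doctypes) from the JSON config file.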
    with open(index_config_path) as data_file:
        index_config = json.load(data_file)

    index_settings = index_config['index_settings']
    index_name = index_settings['indexname']
    doctypes = index_settings['doctypes']

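    # Create the index, then add each doctype's mappings and index its documents.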
    indexer = Indexer(host, port)
    indexer.create_index(index_name, index_settings['settings'], overwrite=overwrite)

    for doctype in doctypes:
        indexer.add_mappings(index_name, doctype, index_settings['mappings'][doctype])
        indexer.index_documents(documents_path, doctype, index_name)
Example #2
# Assumed import: Indexer provides the class-level index helpers used below.
from indexer import Indexer


def command_index(directory):
    """
        Indexes data that from given directory again 
    """
    global dictionary
    global index
    Indexer.remove_index()
    # Set default data directory
    if directory is None:
        directory = 'reuters21578'
    print('Indexing ' + directory + ' folder...')
    Indexer.create_index(directory=directory)
    dictionary, index = Indexer.get_index()
    print('Index created')
Example #3
import os
from argparse import ArgumentParser

# Assumed import: the Indexer class is expected to come from a local module.
from indexer import Indexer


def main():
    parser = ArgumentParser()
    parser.add_argument("-p",
                        "--path",
                        dest="path",
                        default=None,
                        help="Document path")
    parser.add_argument("-t",
                        "--threads",
                        dest="threads",
                        help="Number of threads to launch")
    args = parser.parse_args()
    path = os.path.abspath(args.path) if args.path else "docs"
    threads = int(args.threads) if args.threads else 5
    indexer = Indexer(path, threads)
    indexer.create_index()
Example #4
from indexer import Indexer

# Instantiate Indexer with a path

index = Indexer("../TMP/")

# Create Index

# index.create_index()

# Create Index with filters

index.create_index(duplicates=True, max_size="315 KB")

# Filter By Duplicates

# duplicates = index.filter_duplicates()

# Write to file

index.write_to_file()
Example #5
# Python 2 snippet: urllib2, robotparser and dict.iteritems() are Python 2 APIs.
import re
import robotparser
import urllib2
import urlparse
from collections import Counter

# Assumed imports: BeautifulSoup from bs4 and the Indexer class from a local module.
from bs4 import BeautifulSoup
from indexer import Indexer


class Crawler(object):
    def __init__(self):
        self.visited_url = set()
        self.root_url = None
        self.indexer = Indexer()

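    # Check whether the site's robots.txt allows fetching the given URL.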
    def pass_robot_txt(self, url):
        robot = robotparser.RobotFileParser()
        # Point the parser at the site's robots.txt file rather than the root page itself.
        robot.set_url(urlparse.urljoin(self.root_url, '/robots.txt'))
        robot.read()

        return robot.can_fetch('*', url)

    def define_root_url(self, url):
        self.root_url = url

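    # Collect internal links (relative hrefs) from the page and return them as absolute URLs.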
    def add_included_suburls(self, soup):

        urls = set()

        refs = soup.findAll('a')

        for ref in refs:
            try:
                href = ref['href']
            except KeyError:
                print("Link has no href attribute")
                continue

            if len(href) < 2:
                continue

            if '//' in href:
                continue

            if href[0] != '/':
                continue

            if self.root_url in href:
                urls.add(href)

            urls.add(self.root_url + href)

        return urls

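    # Extract the words from the page's visible text, register them with the indexer,
    # and return their counts.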
    def get_pair_word_and_count(self, soup):
        def visible(element):
            if element.parent.name in [
                    'head', 'script', 'style', '[document]'
            ]:
                return False

            if re.match('<!--.*-->', str(element)):
                return False

            if element == '\n':
                return False

            return True

        data = soup.findAll(text=True)

        visible_text = filter(visible, data)
        words = list()

        for text in visible_text:
            result = re.findall(r'[0-9a-z]+', text.lower())

            for res in result:
                words.append(res)

        self.indexer.add_words(set(words))

        return Counter(words)

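    # Recursively crawl from url, following at most width links per page down to depth levels.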
    def visit(self, url, width, depth):

        if depth < 0:
            return

        if not self.pass_robot_txt(url):
            raise Exception("Blocked by robots.txt")

        current_url = url
        self.indexer.add_url(current_url)

        depth = depth - 1

        try:
            html = urllib2.urlopen(url).read()
        except Exception:
            print("Can't open this *** url")
            return

        soup = BeautifulSoup(html, 'html.parser')

        urls = self.add_included_suburls(soup)

        for url in urls:
            if url in self.visited_url:
                continue

            if width == 0:
                break

            self.visited_url.add(url)
            width = width - 1
            self.visit(url, width, depth)

        words = self.get_pair_word_and_count(soup).iteritems()

        self.indexer.create_index(words, current_url)

    def run(self, url, width, depth):
        self.define_root_url(url)
        self.visit(url, width, depth)
Example #6
from indexer import Indexer
from finder import Finder

indexer = Indexer()
indexer.create_index()
# print("Index:\n" + str(indexer.get_index()))
print("Indexing done...")

finder = Finder(indexer.get_index())

query = input("Enter the search keyword:")
finder.search(query)