def createIndex(root="/home/httpd/nosher.net/docs"):
    """Build a Whoosh search index under ./index for the nosher.net site.

    Schema definition:
        path    -- ID, stored (used as the document's URL/identifier)
        imgs    -- IDLIST, stored (comma-joined thumbnail names for albums)
        image   -- TEXT, stored (URL of the representative thumbnail)
        content -- TEXT, stored (indexed body text)
        date    -- DATETIME, sortable

    root -- base docs directory; article texts are read from
            <root>/archives/computers and photo albums from <root>/images.
            Defaults to the path the original code hard-coded.

    NOTE(review): the original version overwrote this parameter with a
    hard-coded path, so it was dead; it is now honoured, with the old
    path as the default.
    """
    schema = Schema(path = ID(stored = True), imgs = IDLIST(stored = True),
                    image = TEXT(stored = True), content = TEXT(stored = True),
                    date = DATETIME(sortable = True))
    if not os.path.exists("index"):
        os.mkdir("index")
    ix = create_in("index", schema)
    writer = ix.writer()

    _index_computer_archives(writer, root)
    _index_image_albums(writer, root)

    writer.commit()


def _index_computer_archives(writer, root):
    """Index each .txt article under <root>/archives/computers."""
    archive_dir = os.path.join(root, "archives", "computers")
    for name in os.listdir(archive_dir):
        if not name.endswith(".txt"):
            continue
        full = os.path.join(archive_dir, name)
        with open(full, "r") as fh:
            print(full)
            # Flatten the file to one line, then strip HTML tags and
            # [...] markup before indexing.
            text = " ".join(fh.readlines()).replace("\n", "")
        text = re.sub(r"<.*?>|\[.*?\]", "", text)
        base = name.replace(".txt", "")
        img = "{}/archives/computers/images/{}-s.jpg".format(WEBROOT, base)
        url = "archives/computers/{}".format(base)
        writer.add_document(path = url, content = text, image = img)


def _index_image_albums(writer, root):
    """Index every details.txt photo-album manifest under <root>/images.

    Each manifest is tab-separated; lines whose first field is "title" or
    "intro" describe the album itself, "locn" lines are skipped, and any
    other line describes one photo in the album.
    """
    images_dir = os.path.join(root, "images")
    # Prefix stripped from filesystem paths to obtain site-relative URLs.
    prefix = root.rstrip("/") + "/"
    for path, folder, files in os.walk(images_dir):
        for f in files:
            if f != "details.txt":
                continue
            full = os.path.join(path, f)
            print(full)
            # Album date comes from a YYYY-MM-DD directory-name prefix;
            # fall back to a fixed sentinel date when it doesn't parse.
            try:
                timestamp = datetime.strptime(path.split("/")[-1][0:10], "%Y-%m-%d")
            except ValueError:
                timestamp = datetime.strptime("1989-10-01", "%Y-%m-%d")
            try:
                with open(full, "r") as fh:
                    text = fh.readlines()
            except UnicodeDecodeError:
                # Best-effort: skip manifests that aren't valid text.
                print(full + " failed")
                continue
            if len(text) <= 3:
                continue
            first = 0  # line index of the first photo row, for numbering
            for i in range(0, len(text)):
                try:
                    parts = text[i].split("\t")
                    if len(parts) <= 1:
                        continue
                    # Convert the filesystem path to a site-relative URL.
                    webpath = path.replace(prefix, "")
                    if parts[0] == "title" or parts[0] == "intro":
                        # Album description: collect up to four thumbnails
                        # from lines 3-6 when the manifest is long enough.
                        images = []
                        if len(text) > 7:
                            for j in range(3, 7):
                                images.append(text[j].split("\t")[0])
                        img = "{}/{}/{}-s.jpg".format(WEBROOT, webpath,
                                                      text[3].split("\t")[0])
                        writer.add_document(path = webpath,
                                            imgs = ",".join(images),
                                            content = parts[1],
                                            image = img,
                                            date = timestamp)
                    elif parts[0] == "locn":
                        pass
                    else:
                        # Individual photo row: number photos relative to
                        # the first photo line in the manifest.
                        if first == 0:
                            first = i
                        img = "{}/{}/{}-s.jpg".format(WEBROOT, webpath, parts[0])
                        writer.add_document(path = "{}/{}".format(webpath, i - first),
                                            content = parts[1],
                                            image = img,
                                            date = timestamp)
                except ValueError:
                    # Was `print(path, l)` — `l` was never defined and
                    # would have raised NameError; report the bad line.
                    print(path, text[i])
def build_schema(self, fields):
    """Build a Whoosh Schema from Haystack field definitions.

    fields -- mapping of field name -> Haystack SearchField instance.

    Returns a (content_field_name, Schema) tuple, where
    content_field_name is the index fieldname of the document field
    (the field with ``document=True``), or '' if none was seen.

    Raises SearchBackendError when no user fields were found beyond the
    hard-coded Haystack bookkeeping fields.
    """
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # BUGFIX: was `ChineseAnalyer()` (misspelled), which would
            # raise NameError for any plain text field. The duplicate
            # copy of this method elsewhere in the file spells it
            # correctly.
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
def build_schema(self, fields):
    """Translate Haystack field definitions into a Whoosh Schema.

    fields -- mapping of field name -> Haystack SearchField instance.

    Returns (content_field_name, Schema); content_field_name is the
    index fieldname of the ``document=True`` field, or '' if absent.
    Raises SearchBackendError when no user-defined fields exist beyond
    the reserved Haystack bookkeeping fields.
    """
    # Reserved fields Haystack always needs; anything added beyond these
    # counts as a "real" search field.
    whoosh_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    reserved_count = len(whoosh_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        key = field_class.index_fieldname
        kind = field_class.field_type

        if field_class.is_multivalued:
            # Unindexed multivalued fields become IDLIST; indexed ones
            # become comma-separated KEYWORD fields.
            if field_class.indexed is False:
                whoosh_fields[key] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                whoosh_fields[key] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif kind in ['date', 'datetime']:
            whoosh_fields[key] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif kind == 'integer':
            whoosh_fields[key] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif kind == 'float':
            whoosh_fields[key] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif kind == 'boolean':
            # BOOLEAN does not accept field_boost.
            whoosh_fields[key] = BOOLEAN(stored=field_class.stored)
        elif kind == 'ngram':
            whoosh_fields[key] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif kind == 'edge_ngram':
            whoosh_fields[key] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # Default: full-text field analysed with the Chinese
            # analyzer (jieba), stored and sortable.
            whoosh_fields[key] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            # The document field also gets spelling support.
            content_field_name = key
            whoosh_fields[key].spelling = True

    # Fail early and clearly if the user defined no search fields at all.
    if len(whoosh_fields) <= reserved_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**whoosh_fields))